diff options
Diffstat (limited to 'test/OpenMP')
32 files changed, 7319 insertions, 34 deletions
diff --git a/test/OpenMP/cancel_messages.cpp b/test/OpenMP/cancel_messages.cpp index e23b5c337bc8..f8bbe2c36bea 100644 --- a/test/OpenMP/cancel_messages.cpp +++ b/test/OpenMP/cancel_messages.cpp @@ -4,8 +4,16 @@ int main(int argc, char **argv) { #pragma omp cancellation // expected-error {{expected an OpenMP directive}} #pragma omp cancel // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} ; +#pragma omp parallel + { +#pragma omp cancel // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} + } #pragma omp cancel parallel untied // expected-error {{unexpected OpenMP clause 'untied' in directive '#pragma omp cancel'}} #pragma omp cancel unknown // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} +#pragma omp parallel + { +#pragma omp cancel unknown // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} + } #pragma omp cancel sections( // expected-warning {{extra tokens at the end of '#pragma omp cancel' are ignored}} #pragma omp cancel for, ) // expected-warning {{extra tokens at the end of '#pragma omp cancel' are ignored}} #pragma omp cancel taskgroup() // expected-warning {{extra tokens at the end of '#pragma omp cancel' are ignored}} diff --git a/test/OpenMP/cancellation_point_messages.cpp b/test/OpenMP/cancellation_point_messages.cpp index 2324915e83f8..21b0e9c2fa2d 100644 --- a/test/OpenMP/cancellation_point_messages.cpp +++ b/test/OpenMP/cancellation_point_messages.cpp @@ -4,8 +4,16 @@ int main(int argc, char **argv) { #pragma omp cancellation // expected-error {{expected an OpenMP directive}} #pragma omp cancellation point // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} ; +#pragma omp parallel + { +#pragma omp cancellation point // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} + } #pragma omp cancellation point parallel untied // expected-error {{unexpected OpenMP clause 'untied' in directive '#pragma omp cancellation point'}} #pragma omp cancellation point unknown // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} +#pragma omp parallel + { +#pragma omp cancellation point unknown // expected-error {{one of 'for', 'parallel', 'sections' or 'taskgroup' is expected}} + } #pragma omp cancellation point sections( // expected-warning {{extra tokens at the end of '#pragma omp cancellation point' are ignored}} #pragma omp cancellation point for, ) // expected-warning {{extra tokens at the end of '#pragma omp cancellation point' are ignored}} #pragma omp cancellation point taskgroup() // expected-warning {{extra tokens at the end of '#pragma omp cancellation point' are ignored}} diff --git a/test/OpenMP/declare_reduction_messages.cpp b/test/OpenMP/declare_reduction_messages.cpp index a1373b1dcb9b..198d7756d035 100644 --- a/test/OpenMP/declare_reduction_messages.cpp +++ b/test/OpenMP/declare_reduction_messages.cpp @@ -1,4 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++98 %s +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 %s int temp; // expected-note 7 {{'temp' declared here}} @@ -51,7 +53,17 @@ class Class2 : public Class1<T> { #pragma omp declare reduction(fun222 : long : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'long'}} #pragma omp declare reduction(fun1 : long : omp_out += omp_in) initializer // expected-error {{expected '(' after 'initializer'}} #pragma omp declare reduction(fun2 : long : omp_out += omp_in) initializer { // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} -#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[ // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} +#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[ +#if __cplusplus <= 199711L +// expected-error@-2 {{expected '(' after 'initializer'}} +// expected-error@-3 {{expected expression}} +// expected-warning@-4 {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} +#else +// expected-error@-6 {{expected '(' after 'initializer'}} +// expected-error@-7 {{expected variable name or 'this' in lambda capture list}} +// expected-error@-8 {{expected ')'}} +// expected-note@-9 {{to match this '('}} +#endif #pragma omp declare reduction(fun4 : long : omp_out += omp_in) initializer() // expected-error {{expected expression}} #pragma omp declare reduction(fun5 : long : omp_out += omp_in) initializer(temp) // expected-error {{only 'omp_priv' or 'omp_orig' variables are allowed in initializer expression}} #pragma omp declare reduction(fun6 : long : omp_out += omp_in) initializer(omp_orig // expected-error {{expected ')'}} expected-note {{to match this '('}} diff --git a/test/OpenMP/distribute_collapse_messages.cpp b/test/OpenMP/distribute_collapse_messages.cpp index 8818d659d441..4897e6931d27 100644 --- a/test/OpenMP/distribute_collapse_messages.cpp +++ b/test/OpenMP/distribute_collapse_messages.cpp @@ -1,8 +1,13 @@ // RUN: %clang_cc1 -verify -fopenmp %s +// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s +// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s void foo() { } +#if __cplusplus >= 201103L + // expected-note@+2 4 {{declared here}} +#endif bool foobool(int argc) { return argc; } @@ -29,6 +34,9 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; #pragma omp distribute collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute', but found only 1}} +#if __cplusplus >= 201103L + // expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif // expected-error@+3 2 {{directive '#pragma omp distribute' cannot contain more than one 'collapse' clause}} // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}} // expected-error@+1 2 {{expression is not an integral constant expression}} @@ -36,7 +44,11 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; #pragma omp distribute collapse (S) // expected-error {{'S' does not refer to a value}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; - // expected-error@+1 2 {{expression is not an integral constant expression}} +#if __cplusplus <= 199711L + // expected-error@+4 2 {{expression is not an integral constant expression}} +#else + // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}} +#endif #pragma omp distribute collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; #pragma omp distribute collapse (1) @@ -59,8 +71,14 @@ int main(int argc, char **argv) { for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute', but found only 1}} #pragma omp distribute collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute' are ignored}} expected-note {{as specified in 'collapse' clause}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute', but found only 1}} +#if __cplusplus >= 201103L + // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif #pragma omp distribute collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; +#if __cplusplus >= 201103L + // expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif // expected-error@+3 {{expression is not an integral constant expression}} // expected-error@+2 2 {{directive '#pragma omp distribute' cannot contain more than one 'collapse' clause}} // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}} @@ -68,7 +86,11 @@ int main(int argc, char **argv) { for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; #pragma omp distribute collapse (S1) // expected-error {{'S1' does not refer to a value}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; - // expected-error@+1 {{expression is not an integral constant expression}} +#if __cplusplus <= 199711L + // expected-error@+4 {{expression is not an integral constant expression}} +#else + // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}} +#endif #pragma omp distribute collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error@+3 {{statement after '#pragma omp distribute' must be a for loop}} diff --git a/test/OpenMP/distribute_firstprivate_codegen.cpp b/test/OpenMP/distribute_firstprivate_codegen.cpp new file mode 100644 index 000000000000..718fc875254c --- /dev/null +++ b/test/OpenMP/distribute_firstprivate_codegen.cpp @@ -0,0 +1,382 @@ +// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 +// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 +// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32 +// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32 + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +template <class T> +struct S { + T f; + S(T a) : f(a) {} + S() : f() {} + operator T() { return T(); } + ~S() {} +}; + +// CHECK: [[S_FLOAT_TY:%.+]] = type { float } +// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} } +template <typename T> +T tmain() { + S<T> test; + T t_var = T(); + T vec[] = {1, 2}; + S<T> s_arr[] = {1, 2}; + S<T> &var = test; + #pragma omp target + #pragma omp teams +#pragma omp distribute firstprivate(t_var, vec, s_arr, s_arr, var, var) + for (int i = 0; i < 2; ++i) { + vec[i] = t_var; + s_arr[i] = var; + } + return T(); +} + +int main() { + static int svar; + volatile double g; + volatile double &g1 = g; + + #ifdef LAMBDA + // LAMBDA-LABEL: @main + // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]]( + [&]() { + static float sfvar; + // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]]( + // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams( + // LAMBDA: call void [[OFFLOADING_FUN:@.+]]( + + // LAMBDA: define{{.+}} void [[OFFLOADING_FUN]]( + // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED:@.+]] to {{.+}}) + #pragma omp target + #pragma omp teams +#pragma omp distribute firstprivate(g, g1, svar, sfvar) + for (int i = 0; i < 2; ++i) { + // LAMBDA-64: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{[0-9]+}} [[G_IN:%.+]], i{{[0-9]+}} [[G1_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]], i{{[0-9]+}} [[SFVAR_IN:%.+]]) + // LAMBDA-32: define internal{{.*}} void [[OMP_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, double*{{.*}} [[G_IN:%.+]], i{{[0-9]+}} [[G1_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]], i{{[0-9]+}} [[SFVAR_IN:%.+]]) + // Private alloca's for conversion + // LAMBDA-64: [[G_ADDR:%.+]] = alloca i{{[0-9]+}}, + // LAMBDA-32: [[G_ADDR:%.+]] = alloca double*, + // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}}, + // LAMBDA: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}, + // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca i{{[0-9]+}}, + // LAMBDA: [[G1_REF:%.+]] = alloca double*, + // LAMBDA: [[TMP:%.+]] = alloca double*, + + // Actual private variables to be used in the body (tmp is used for the reference type) + // LAMBDA: [[G_PRIVATE:%.+]] = alloca double, + // LAMBDA: [[G1_PRIVATE:%.+]] = alloca double, + // LAMBDA: [[TMP_PRIVATE:%.+]] = alloca double*, + // LAMBDA: [[SVAR_PRIVATE:%.+]] = alloca i{{[0-9]+}}, + // LAMBDA: [[SFVAR_PRIVATE:%.+]] = alloca float, + + // Store input parameter addresses into private alloca's for conversion + // LAMBDA-64: store i{{[0-9]+}} [[G_IN]], i{{[0-9]+}}* [[G_ADDR]], + // LAMBDA-32: store double* [[G_IN]], double** [[G_ADDR]], + // LAMBDA: store i{{[0-9]+}} [[G1_IN]], i{{[0-9]+}}* [[G1_ADDR]], + // LAMBDA: store i{{[0-9]+}} [[SVAR_IN]], i{{[0-9]+}}* [[SVAR_ADDR]], + // LAMBDA: store i{{[0-9]+}} [[SFVAR_IN]], i{{[0-9]+}}* [[SFVAR_ADDR]], + + // LAMBDA-64-DAG: [[G_CONV:%.+]] = bitcast i{{[0-9]+}}* [[G_ADDR]] to double* + // LAMBDA-32-DAG: [[G_ADDR_VAL:%.+]] = load double*, double** [[G_ADDR]], + // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast i{{[0-9]+}}* [[G1_ADDR]] to double* + // LAMBDA-DAG: store double* [[G1_CONV]], double** [[G1_REF]], + // LAMBDA-64-DAG: [[SVAR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[SVAR_ADDR]] to i{{[0-9]+}}* + // LAMBDA-DAG: [[SFVAR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[SFVAR_ADDR]] to float* + // LAMBDA-DAG: [[G1_REF_VAL:%.+]] = load double*, double** [[G1_REF]], + // LAMBDA-DAG: store double* [[G1_REF_VAL]], double** [[TMP]], + // LAMBDA-64-DAG: [[G_CONV_VAL:%.+]] = load{{.*}} double, double* [[G_CONV]], + // LAMBDA-32-DAG: [[G_CONV_VAL:%.+]] = load{{.*}} double, double* [[G_ADDR_VAL]], + // LAMBDA-DAG: store double [[G_CONV_VAL]], double* [[G_PRIVATE]], + // LAMBDA-DAG: [[TMP_VAL:%.+]] = load double*, double** [[TMP]], + // LAMBDA-DAG: [[TMP_VAL_VAL:%.+]] = load{{.*}} double, double* [[TMP_VAL]], + // LAMBDA-DAG: store double [[TMP_VAL_VAL]], double* [[G1_PRIVATE]], + // LAMBDA-DAG: store double* [[G1_PRIVATE]], double** [[TMP_PRIVATE]], + // LAMBDA-64-DAG: [[SVAR_CONV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_CONV]], + // LAMBDA-32-DAG: [[SVAR_CONV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_ADDR]], + // LAMBDA-DAG: store i{{[0-9]+}} [[SVAR_CONV_VAL]], i{{[0-9]+}}* [[SVAR_PRIVATE]], + // LAMBDA-DAG: [[SFVAR_CONV_VAL:%.+]] = load float, float* [[SFVAR_CONV]], + // LAMBDA-DAG: store float [[SFVAR_CONV_VAL]], float* [[SFVAR_PRIVATE]], + // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4( + g += 1; + g1 += 1; + svar += 3; + sfvar += 4.0; + // LAMBDA-DAG: [[G_VAL:%.+]] = load double, double* [[G_PRIVATE]], + // LAMBDA-DAG: [[G_NEXT:%.+]] = fadd double [[G_VAL]], 1.{{.+}} + // LAMBDA-DAG: store double [[G_NEXT]], double* [[G_PRIVATE]], + // LAMBDA-DAG: [[TMP_VAL1:%.+]] = load double*, double** [[TMP_PRIVATE]], + // LAMBDA-DAG: [[TMP_VAL_VAL1:%.+]] = load{{.*}} double, double* [[TMP_VAL1]], + // LAMBDA-DAG: [[TMP_ADD:%.+]] = fadd double [[TMP_VAL_VAL1]], 1.{{.+}} + // LAMBDA-DAG: store{{.*}} double [[TMP_ADD]], double* [[TMP_VAL1]], + // LAMBDA-DAG: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]], + // LAMBDA-DAG: [[SVAR_ADD:%.+]] = add{{.*}} i{{[0-9]+}} [[SVAR_VAL]], 3 + // LAMBDA-DAG: store i{{[0-9]+}} [[SVAR_ADD]], i{{[0-9]+}}* [[SVAR_PRIVATE]], + // LAMBDA-DAG: [[SFVAR_VAL:%.+]] = load float, float* [[SFVAR_PRIVATE]], + // LAMBDA-DAG: [[SFVAR_CONV_VAL1:%.+]] = fpext float [[SFVAR_VAL]] to double + // LAMBDA-DAG: [[SFVAR_ADD:%.+]] = fadd double [[SFVAR_CONV_VAL1]], 4.{{.+}} + // LAMBDA-DAG: [[SFVAR_CONV_VAL2:%.+]] = fptrunc double [[SFVAR_ADD]] to float + // LAMBDA-DAG: store float [[SFVAR_CONV_VAL2:%.+]], float* [[SFVAR_PRIVATE]], + + // call inner lambda (use refs to private alloca's) + // LAMBDA: [[GEP_0:%.+]] = getelementptr{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // LAMBDA: store double* [[G_PRIVATE]], double** [[GEP_0]], + // LAMBDA: [[GEP_1:%.+]] = getelementptr{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // LAMBDA: [[TMP_PAR:%.+]] = load double*, double** [[TMP_PRIVATE]], + // LAMBDA: store double* [[TMP_PAR]], double** [[GEP_1]], + // LAMBDA: [[GEP_2:%.+]] = getelementptr{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2 + // LAMBDA: store i{{[0-9]+}}* [[SVAR_PRIVATE]], i{{[0-9]+}}** [[GEP_2]], + // LAMBDA: [[GEP_3:%.+]] = getelementptr{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3 + // LAMBDA: store float* [[SFVAR_PRIVATE]], float** [[GEP_3]], + // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* {{.+}}) + // LAMBDA: call {{.*}}void @__kmpc_for_static_fini( + [&]() { + // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]]) + // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]], + g += 2; + g1 += 2; + svar += 4; + sfvar += 8.0; + // LAMBDA-DAG: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]] + // LAMBDA-DAG: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // LAMBDA-DAG: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]], + // LAMBDA-DAG: [[G_REF_VAL:%.+]] = load double, double* [[G_REF]], + // LAMBDA-DAG: [[G_REF_ADD:%.+]] = fadd double [[G_REF_VAL]], 2.{{.+}} + // LAMBDA-DAG: store double [[G_REF_ADD]], double* [[G_REF]] + + // LAMBDA-DAG: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // LAMBDA-DAG: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]] + // LAMBDA-DAG: [[G1_REF_VAL:%.+]] = load double, double* [[G1_REF]], + // LAMBDA-DAG: [[G1_ADD:%.+]] = fadd double [[G1_REF_VAL]], 2.{{.+}} + // LAMBDA-DAG: store double [[G1_ADD]], double* [[G1_REF]], + + // LAMBDA-DAG: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 + // LAMBDA-DAG: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]] + // LAMBDA-DAG: [[SVAR_REF_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_REF]] + // LAMBDA-DAG: [[SVAR_ADD:%.+]] = add{{.*}} i{{[0-9]+}} [[SVAR_REF_VAL]], 4 + // LAMBDA-DAG: store i{{[0-9]+}} [[SVAR_ADD]], i{{[0-9]+}}* [[SVAR_REF]] + + // LAMBDA-DAG: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3 + // LAMBDA-DAG: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]] + // LAMBDA-DAG: [[SFVAR_REF_VAL:%.+]] = load float, float* [[SFVAR_REF]] + // LAMBDA-DAG: [[SFVAR_REF_CONV:%.+]] = fpext float [[SFVAR_REF_VAL]] to double + // LAMBDA-DAG: [[SFVAR_ADD:%.+]] = fadd double [[SFVAR_REF_CONV]], 8.{{.+}} + // LAMBDA-DAG: [[SFVAR_ADD_CONV:%.+]] = fptrunc double [[SFVAR_ADD]] to float + // LAMBDA-DAG: store float [[SFVAR_ADD_CONV]], float* [[SFVAR_REF]], + }(); + } + }(); + return 0; + #else + S<float> test; + int t_var = 0; + int vec[] = {1, 2}; + S<float> s_arr[] = {1, 2}; + S<float> &var = test; + + #pragma omp target + #pragma omp teams + #pragma omp distribute firstprivate(t_var, vec, s_arr, s_arr, var, var, svar) + for (int i = 0; i < 2; ++i) { + vec[i] = t_var; + s_arr[i] = var; + } + return tmain<int>(); + #endif +} + +// CHECK: define{{.*}} i{{[0-9]+}} @main() +// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]], +// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]]) +// CHECK: call i{{[0-9]+}} @__tgt_target_teams( +// CHECK: call void [[OFFLOAD_FUN:@.+]]( +// CHECK: ret + +// CHECK: define{{.+}} [[OFFLOAD_FUN]]( +// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[OMP_OUTLINED:@.+]] to void +// CHECK: ret +// +// CHECK: define internal void [[OMP_OUTLINED]](i{{[0-9]+}}*{{.+}}, i{{[0-9]+}}*{{.+}}, i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]*{{.*}} [[VEC_IN:%.+]], [2 x [[S_FLOAT_TY]]]*{{.*}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]*{{.*}} [[VAR_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]]) + +// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}}, +// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*, +// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*, +// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*, +// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}, +// CHECK: [[TMP:%.+]] = alloca [[S_FLOAT_TY]]*, + +// discard omp loop variables +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, + +// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, +// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], +// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]], +// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]], +// CHECK-DAG: [[TMP_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*, +// CHECK: [[SVAR_PRIV:%.+]] = alloca i{{[0-9]+}}, + +// CHECK: store i{{[0-9]+}} [[T_VAR_IN]], i{{[0-9]+}}* [[T_VAR_ADDR]], +// CHECK: store [2 x i{{[0-9]+}}]* [[VEC_IN]], [2 x i{{[0-9]+}}]** [[VEC_ADDR]], +// CHECK: store [2 x [[S_FLOAT_TY]]]* [[S_ARR_IN]], [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]], +// CHECK: store [[S_FLOAT_TY]]* [[VAR_IN]], [[S_FLOAT_TY]]** [[VAR_ADDR]], +// CHECK: store i{{[0-9]+}} [[SVAR_IN]], i{{[0-9]+}}* [[SVAR_ADDR]], + +// init t_var +// CHECK-64-DAG: [[T_VAR_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_ADDR]] to i{{[0-9]+}}* +// CHECK-64-DAG: [[T_VAR_ADDR_CONV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_ADDR_CONV]], +// CHECK-32-DAG: [[T_VAR_ADDR_CONV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_ADDR]], +// CHECK-DAG: store i{{[0-9]+}} [[T_VAR_ADDR_CONV_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], + +// init vec +// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]], +// CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}* +// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}* +// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}}) + +// init s_arr +// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]], +// CHECK-DAG: [[S_ARR_ADDR_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_VAL]] to [[S_FLOAT_TY]]* +// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.+}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]]{{.+}} +// CHECK-DAG: [[S_ARR_PRIV_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BGN]]{{.+}} +// CHECK-DAG: [[S_ARR_IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BGN]], [[S_ARR_PRIV_NEXT]] +// CHECK-DAG: br i1 [[S_ARR_IS_EMPTY]], label %[[S_ARR_CPY_DONE:.+]], label %[[S_ARR_CPY_BODY:.+]] + +// CHECK-DAG: [[S_ARR_CPY_BODY]]: +// CHECK-DAG: [[S_ARR_SRC_PAST:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BCAST]],{{.+}} ], [ [[S_ARR_SRC:%.+]],{{.+}} ] +// CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ] +// CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}} +// CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}} +// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}}) +// CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}} +// CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}} +// CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]] +// CHECK-DAG: br i1 [[S_ARR_CPY_FIN]], label %[[S_ARR_CPY_DONE]], label %[[S_ARR_CPY_BODY]] +// CHECK-DAG: [[S_ARR_CPY_DONE]]: + +// init var +// CHECK-DAG: [[VAR_ADDR_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]], +// CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP]], +// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]], +// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}} +// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}} +// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}}) +// CHECK-DAG: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]], + +// init svar +// CHECK-64-DAG: [[SVAR_ADDR_CONV:%.+]] = bitcast{{.+}} [[SVAR_ADDR]] to{{.+}} +// CHECK-64-DAG: [[SVAR_CONV_VAL:%.+]] = load{{.+}},{{.+}} [[SVAR_ADDR_CONV]], +// CHECK-32-DAG: [[SVAR_CONV_VAL:%.+]] = load{{.+}},{{.+}} [[SVAR_ADDR]], +// CHECK-DAG: store{{.+}} [[SVAR_CONV_VAL]],{{.+}} [[SVAR_PRIV]], + +// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}}, +// CHECK-DAG: store i{{[0-9]+}} 1, i{{[0-9]+}}* %.omp{{.+}}, +// CHECK-DAG: store i{{[0-9]+}} 1, i{{[0-9]+}}* %.omp{{.+}}, +// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}}, + +// CHECK: call void @__kmpc_for_static_init_4( +// CHECK: call void @__kmpc_for_static_fini( +// CHECK: ret void + +// Template +// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT:@.+]]() +// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]], +// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]]) +// CHECK: call i{{[0-9]+}} @__tgt_target_teams( +// CHECK: call void [[OFFLOAD_FUN_1:@.+]]( +// CHECK: ret + +// CHECK: define{{.+}} [[OFFLOAD_FUN_1]]( +// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[OMP_OUTLINED_1:@.+]] to void +// CHECK: ret +// +// CHECK: define internal void [[OMP_OUTLINED_1]](i{{[0-9]+}}*{{.+}}, i{{[0-9]+}}*{{.+}}, i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]*{{.*}} [[VEC_IN:%.+]], [2 x [[S_INT_TY]]]*{{.*}} [[S_ARR_IN:%.+]], [[S_INT_TY]]*{{.*}} [[VAR_IN:%.+]]) + +// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}}, +// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*, +// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*, +// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*, +// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*, + +// discard omp loop variables +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, +// CHECK: {{.*}} = alloca i{{[0-9]+}}, + +// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, +// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], +// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], +// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], +// CHECK-DAG: [[TMP_PRIV:%.+]] = alloca [[S_INT_TY]]*, + +// CHECK: store i{{[0-9]+}} [[T_VAR_IN]], i{{[0-9]+}}* [[T_VAR_ADDR]], +// CHECK: store [2 x i{{[0-9]+}}]* [[VEC_IN]], [2 x i{{[0-9]+}}]** [[VEC_ADDR]], +// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_IN]], [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]], +// CHECK: store [[S_INT_TY]]* [[VAR_IN]], [[S_INT_TY]]** [[VAR_ADDR]], + +// init t_var +// CHECK-64-DAG: [[T_VAR_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_ADDR]] to i{{[0-9]+}}* +// CHECK-64-DAG: [[T_VAR_ADDR_CONV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_ADDR_CONV]], +// CHECK-32-DAG: [[T_VAR_ADDR_CONV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_ADDR]], +// CHECK-DAG: store i{{[0-9]+}} [[T_VAR_ADDR_CONV_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], + +// init vec +// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]], +// CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}* +// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}* +// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}}) + +// init s_arr +// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]], +// CHECK-DAG: [[S_ARR_ADDR_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_VAL]] to [[S_INT_TY]]* +// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.+}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]]{{.+}} +// CHECK-DAG: [[S_ARR_PRIV_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BGN]]{{.+}} +// CHECK-DAG: [[S_ARR_IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BGN]], [[S_ARR_PRIV_NEXT]] +// CHECK-DAG: br i1 [[S_ARR_IS_EMPTY]], label %[[S_ARR_CPY_DONE:.+]], label %[[S_ARR_CPY_BODY:.+]] + +// CHECK-DAG: [[S_ARR_CPY_BODY]]: +// CHECK-DAG: [[S_ARR_SRC_PAST:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BCAST]],{{.+}} ], [ [[S_ARR_SRC:%.+]],{{.+}} ] +// CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ] +// CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}} +// CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}} +// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}}) +// CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}} +// CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}} +// CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]] +// CHECK-DAG: br i1 [[S_ARR_CPY_FIN]], label %[[S_ARR_CPY_DONE]], label %[[S_ARR_CPY_BODY]] +// CHECK-DAG: [[S_ARR_CPY_DONE]]: + +// init var +// CHECK-DAG: [[VAR_ADDR_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]], +// CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP]], +// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]], +// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}} +// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}} +// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}}) +// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]], + +// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}}, +// CHECK-DAG: store i{{[0-9]+}} 1, i{{[0-9]+}}* %.omp{{.+}}, +// CHECK-DAG: store i{{[0-9]+}} 1, i{{[0-9]+}}* %.omp{{.+}}, +// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}}, + +// CHECK: call void @__kmpc_for_static_init_4( +// CHECK: call void @__kmpc_for_static_fini( +// CHECK: ret void +#endif diff --git a/test/OpenMP/distribute_lastprivate_codegen.cpp b/test/OpenMP/distribute_lastprivate_codegen.cpp index 0f596defbfe9..abcbfd8d69c4 100644 --- a/test/OpenMP/distribute_lastprivate_codegen.cpp +++ b/test/OpenMP/distribute_lastprivate_codegen.cpp @@ -229,8 +229,6 @@ int main() { // the distribute loop // CHECK: call void @__kmpc_for_static_init_4( // assignment: vec[i] = t_var; -// the following is extremely weak, because it assumes ordering of this load with no reference after the call to static_init: fix!!!! -// CHECK: [[IV_VAL:%.+]] = // CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]], // CHECK: [[VEC_PTR:%.+]] = getelementptr inbounds [2 x i{{[0-9]+}}], [2 x i{{[0-9]+}}]* [[VEC_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} {{.+}} // CHECK: store i{{[0-9]+}} [[T_VAR_PRIV_VAL]], i{{[0-9]+}}* [[VEC_PTR]], diff --git a/test/OpenMP/linking.c b/test/OpenMP/linking.c index 7b3059242370..71e978f7fe48 100644 --- a/test/OpenMP/linking.c +++ b/test/OpenMP/linking.c @@ -7,42 +7,42 @@ // RUN: -fopenmp -target i386-unknown-linux -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-LD-32 %s // CHECK-LD-32: "{{.*}}ld{{(.exe)?}}" -// CHECK-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc" +// CHECK-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" // CHECK-LD-32: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -fopenmp -target x86_64-unknown-linux -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-LD-64 %s // CHECK-LD-64: "{{.*}}ld{{(.exe)?}}" -// CHECK-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc" +// CHECK-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" // CHECK-LD-64: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -fopenmp=libgomp -target i386-unknown-linux -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-GOMP-LD-32 %s // CHECK-GOMP-LD-32: "{{.*}}ld{{(.exe)?}}" -// CHECK-GOMP-LD-32: "-lgomp" "-lrt" "-lgcc" +// CHECK-GOMP-LD-32: "-lgomp" "-lrt" // CHECK-GOMP-LD-32: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -fopenmp=libgomp -target x86_64-unknown-linux -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-GOMP-LD-64 %s // CHECK-GOMP-LD-64: "{{.*}}ld{{(.exe)?}}" -// CHECK-GOMP-LD-64: "-lgomp" "-lrt" "-lgcc" +// CHECK-GOMP-LD-64: "-lgomp" "-lrt" // CHECK-GOMP-LD-64: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -fopenmp -target i386-unknown-linux -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-IOMP5-LD-32 %s // CHECK-IOMP5-LD-32: "{{.*}}ld{{(.exe)?}}" -// CHECK-IOMP5-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc" +// CHECK-IOMP5-LD-32: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" // CHECK-IOMP5-LD-32: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -fopenmp -target x86_64-unknown-linux -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-IOMP5-LD-64 %s // CHECK-IOMP5-LD-64: "{{.*}}ld{{(.exe)?}}" -// CHECK-IOMP5-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" "-lgcc" +// CHECK-IOMP5-LD-64: "-l[[DEFAULT_OPENMP_LIB:[^"]*]]" // CHECK-IOMP5-LD-64: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ @@ -60,7 +60,7 @@ // RUN: -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-LD-OVERRIDE-32 %s // CHECK-LD-OVERRIDE-32: "{{.*}}ld{{(.exe)?}}" -// CHECK-LD-OVERRIDE-32: "-lgomp" "-lrt" "-lgcc" +// CHECK-LD-OVERRIDE-32: "-lgomp" "-lrt" // CHECK-LD-OVERRIDE-32: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ @@ -68,7 +68,7 @@ // RUN: -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-LD-OVERRIDE-64 %s // CHECK-LD-OVERRIDE-64: "{{.*}}ld{{(.exe)?}}" -// CHECK-LD-OVERRIDE-64: "-lgomp" "-lrt" "-lgcc" +// CHECK-LD-OVERRIDE-64: "-lgomp" "-lrt" // CHECK-LD-OVERRIDE-64: "-lpthread" "-lc" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp index 59c4d5b277ce..dab0c5a082a5 100644 --- a/test/OpenMP/nvptx_target_codegen.cpp +++ b/test/OpenMP/nvptx_target_codegen.cpp @@ -8,6 +8,14 @@ #ifndef HEADER #define HEADER +// Check that the execution mode of all 6 target regions is set to Generic Mode. +// CHECK-DAG: {{@__omp_offloading_.+l98}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l175}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l284}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l321}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l339}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l304}}_exec_mode = weak constant i8 1 + template<typename tx, typename ty> struct TT{ tx X; @@ -23,7 +31,7 @@ int foo(int n) { double cn[5][n]; TT<long long, char> d; - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l98}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -54,7 +62,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]() + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l98]]() // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() @@ -96,7 +104,7 @@ int foo(int n) { { } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l175}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -127,7 +135,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]]) + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l175]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]]) // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]], // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* @@ -169,7 +177,7 @@ int foo(int n) { aa += 1; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l284}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -200,7 +208,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]] + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l284]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* @@ -353,7 +361,7 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+321}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -384,7 +392,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]] + // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l321]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -439,7 +447,7 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l339}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -470,7 +478,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]]( + // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l339]]( // Create local storage for each capture. // CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]* // CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] @@ -529,7 +537,7 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l304}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -560,7 +568,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]] + // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l304]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] diff --git a/test/OpenMP/nvptx_target_parallel_codegen.cpp b/test/OpenMP/nvptx_target_parallel_codegen.cpp new file mode 100644 index 000000000000..7d1662435842 --- /dev/null +++ b/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -0,0 +1,136 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode. +// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0 + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target parallel if(target: 0) + { + a += 1; + } + + #pragma omp target parallel map(tofrom: aa) + { + aa += 1; + } + + #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) + { + a += 1; + aa += 1; + b[2] += 1; + } + + return a; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<int>(n); + + return a; +} + + // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}} + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]]) + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]]) + // CHECK: = alloca i32*, align + // CHECK: = alloca i32*, align + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align + // CHECK: store i16 {{%.+}}, i16* [[AA]], align + // CHECK: ret void + // CHECK: } + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}( + // CHECK: [[A_ADDR:%.+]] = alloca i32*, align + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align + // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align + // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align + // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align + // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align + // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) + // CHECK: = alloca i32*, align + // CHECK: = alloca i32*, align + // CHECK: [[A_ADDR:%.+]] = alloca i32*, align + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align + // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align + // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align + // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align + // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align + // CHECK: store i32 {{%.+}}, i32* [[A]], align + // CHECK: store i16 {{%.+}}, i16* [[AA]], align + // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], + // CHECK: store i32 {{%.+}}, i32* [[ELT]], align + // CHECK: ret void + // CHECK: } +#endif diff --git a/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp new file mode 100644 index 000000000000..bc423c1ee605 --- /dev/null +++ b/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -0,0 +1,126 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode. +// CHECK-DAG: {{@__omp_offloading_.+l21}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0 + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target parallel map(tofrom: aa) num_threads(1024) + { + aa += 1; + } + + #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(n) + { + a += 1; + aa += 1; + b[2] += 1; + } + + return a; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<int>(n); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l21}}( + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK-NOT: call void @__kmpc_push_num_threads + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]]) + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]]) + // CHECK: = alloca i32*, align + // CHECK: = alloca i32*, align + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align + // CHECK: store i16 {{%.+}}, i16* [[AA]], align + // CHECK: ret void + // CHECK: } + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( + // CHECK: [[A_ADDR:%.+]] = alloca i32*, align + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align + // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align + // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align + // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align + // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align + // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK-NOT: call void @__kmpc_push_num_threads + // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]]) + // CHECK: = alloca i32*, align + // CHECK: = alloca i32*, align + // CHECK: [[A_ADDR:%.+]] = alloca i32*, align + // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align + // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align + // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align + // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align + // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align + // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align + // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align + // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align + // CHECK: store i32 {{%.+}}, i32* [[A]], align + // CHECK: store i16 {{%.+}}, i16* [[AA]], align + // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], + // CHECK: store i32 {{%.+}}, i32* [[ELT]], align + // CHECK: ret void + // CHECK: } +#endif diff --git a/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp new file mode 100644 index 000000000000..91c6de1b85d3 --- /dev/null +++ b/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp @@ -0,0 +1,106 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode. +// CHECK-DAG: {{@__omp_offloading_.+l22}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0 + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target parallel proc_bind(master) + { + } + + #pragma omp target parallel proc_bind(spread) + { + aa += 1; + } + + #pragma omp target parallel proc_bind(close) + { + a += 1; + aa += 1; + b[2] += 1; + } + + return a; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<int>(n); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l22}}( + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK-NOT: call void @__kmpc_push_proc_bind + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK-NOT: call void @__kmpc_push_proc_bind + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}( + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK-NOT: call void @__kmpc_push_proc_bind + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } +#endif diff --git a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp new file mode 100644 index 000000000000..c4c3e977b0e2 --- /dev/null +++ b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -0,0 +1,830 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. +// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] + +// Check that the execution mode of all 3 target regions is set to Spmd Mode. +// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 + +template<typename tx> +tx ftemplate(int n) { + int a; + short b; + tx c; + float d; + double e; + + #pragma omp target parallel reduction(+: e) map(tofrom: e) + { + e += 5; + } + + #pragma omp target parallel reduction(^: c) reduction(*: d) map(tofrom: c,d) + { + c ^= 2; + d *= 33; + } + + #pragma omp target parallel reduction(|: a) reduction(max: b) map(tofrom: a,b) + { + a |= 1; + b = 99 > b ? 99 : b; + } + + return a+b+c+d+e; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<char>(n); + + return a; +} + + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( + // + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXECUTE:.+]] + // + // CHECK: [[EXECUTE]] + // CHECK: {{call|invoke}} void [[PFN:@.+]](i32* + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // + // + // define internal void [[PFN]]( + // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 + // CHECK: store double [[ADD]], double* [[E]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* + // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ + // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + + // CHECK: [[REDLABEL]] + // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] + // CHECK: store double [[ADD]], double* [[E_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[DEFAULTLABEL]] + // + // CHECK: [[DEFAULTLABEL]] + // CHECK: ret + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], + // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], + // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], + // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], + // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] + // CHECK: store double [[RES]], double* [[VAR_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double + // + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + + + + + + + + + + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( + // + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXECUTE:.+]] + // + // CHECK: [[EXECUTE]] + // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32* + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // + // + // define internal void [[PFN1]]( + // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align + // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align + // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} + // CHECK: store float [[MUL]], float* [[D]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: store i8* [[C]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* + // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ + // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + + // CHECK: [[REDLABEL]] + // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align + // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 + // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align + // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align + // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] + // CHECK: store float [[MUL]], float* [[D_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[DEFAULTLABEL]] + // + // CHECK: [[DEFAULTLABEL]] + // CHECK: ret + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* + // + // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], + // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 + // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], + // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: store float [[RES]], float* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 + // + // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float + // + // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + + + + + + + + + + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( + // + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXECUTE:.+]] + // + // CHECK: [[EXECUTE]] + // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32* + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // + // + // define internal void [[PFN2]]( + // CHECK: store i32 0, i32* [[A:%.+]], align + // CHECK: store i16 -32768, i16* [[B:%.+]], align + // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align + // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 + // CHECK: store i32 [[OR]], i32* [[A]], align + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align + // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 + // CHECK: store i16 [[TRUNC]], i16* [[B]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* + // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* + // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ + // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + + // CHECK: [[REDLABEL]] + // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align + // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align + // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] + // CHECK: store i32 [[OR]], i32* [[A_IN]], align + // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align + // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[DEFAULTLABEL]] + // + // CHECK: [[DEFAULTLABEL]] + // CHECK: ret + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* + // + // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], + // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], + // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 + // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 + // + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 + // + // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + +#endif diff --git a/test/OpenMP/nvptx_target_printf_codegen.c b/test/OpenMP/nvptx_target_printf_codegen.c new file mode 100644 index 000000000000..0de6885a5a72 --- /dev/null +++ b/test/OpenMP/nvptx_target_printf_codegen.c @@ -0,0 +1,116 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +#include <stdarg.h> + +// expected-no-diagnostics +extern int printf(const char *, ...); +extern int vprintf(const char *, va_list); + +// Check a simple call to printf end-to-end. +// CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double } +int CheckSimple() { + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+CheckSimple.+]]_worker() +#pragma omp target + { + // Entry point. + // CHECK: define {{.*}}void [[T1]]() + // Alloca in entry block. + // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]] + + // CHECK: {{call|invoke}} void [[T1]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + + // printf in master-only basic block. + // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt + const char* fmt = "%d %lld %f"; + // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0 + // CHECK: store i32 1, i32* [[PTR0]], align 4 + // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1 + // CHECK: store i64 2, i64* [[PTR1]], align 8 + // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2 + + // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8 + // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8* + // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]]) + printf(fmt, 1, 2ll, 3.0); + } + + return 0; +} + +void CheckNoArgs() { + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+CheckNoArgs.+]]_worker() +#pragma omp target + { + // Entry point. + // CHECK: define {{.*}}void [[T2]]() + + // CHECK: {{call|invoke}} void [[T2]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + + // printf in master-only basic block. + // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} + printf("hello, world!"); + } +} + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +int foo; +void CheckAllocaIsInEntryBlock() { + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+CheckAllocaIsInEntryBlock.+]]_worker() +#pragma omp target + { + // Entry point. + // CHECK: define {{.*}}void [[T3]]( + // Alloca in entry block. + // CHECK: alloca %printf_args + + // CHECK: {{call|invoke}} void [[T3]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + + if (foo) { + printf("%d", 42); + } + } +} diff --git a/test/OpenMP/nvptx_target_teams_codegen.cpp b/test/OpenMP/nvptx_target_teams_codegen.cpp new file mode 100644 index 000000000000..e823eabbb10f --- /dev/null +++ b/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -0,0 +1,222 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check that the execution mode of all 2 target regions is set to Generic Mode. +// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 1 + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target teams if(0) + { + b[2] += 1; + } + + #pragma omp target teams if(1) + { + a = '1'; + } + + #pragma omp target teams if(n>40) + { + aa = 1; + } + + return a; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<char>(n); + + return a; +} + + // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l21}}_worker() + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], + // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] + // + // CHECK: [[AWAIT_WORK]] + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 + // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null + // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] + // + // CHECK: [[SEL_WORKERS]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 + // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] + // + // CHECK: [[EXEC_PARALLEL]] + // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] + // + // CHECK: [[TERM_PARALLEL]] + // CHECK: call void @__kmpc_kernel_end_parallel() + // CHECK: br label {{%?}}[[BAR_PARALLEL]] + // + // CHECK: [[BAR_PARALLEL]] + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: br label {{%?}}[[AWAIT_WORK]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] [[A:%[^)]+]]) + // CHECK: store i[[SZ]] [[A]], i[[SZ]]* [[A_ADDR:%.+]], align + // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i8* + + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] + // + // CHECK: [[WORKER]] + // CHECK: {{call|invoke}} void [[T1]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + // + // CHECK-NOT: kmpc_fork_teams + // CHECK: [[A_VAL:%.+]] = load i8, i8* [[CONV]], align + // CHECK: [[ACP:%.+]] = bitcast i[[SZ]]* [[AC:%.+]] to i8* + // CHECK: store i8 [[A_VAL]], i8* [[ACP]], align + // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align + // CHECK: store i[[SZ]] [[ACV]], i[[SZ]]* [[A_ADDR_T:%.+]], align + // CHECK: [[CONV2:%.+]] = bitcast i[[SZ]]* [[A_ADDR_T]] to i8* + // CHECK: store i8 49, i8* [[CONV2]], align + // CHECK: br label {{%?}}[[TERMINATE:.+]] + // + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: br label {{%?}}[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], + // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] + // + // CHECK: [[AWAIT_WORK]] + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 + // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null + // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] + // + // CHECK: [[SEL_WORKERS]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 + // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] + // + // CHECK: [[EXEC_PARALLEL]] + // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] + // + // CHECK: [[TERM_PARALLEL]] + // CHECK: call void @__kmpc_kernel_end_parallel() + // CHECK: br label {{%?}}[[BAR_PARALLEL]] + // + // CHECK: [[BAR_PARALLEL]] + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: br label {{%?}}[[AWAIT_WORK]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l31]](i[[SZ:32|64]] [[AA:%[^)]+]]) + // CHECK: store i[[SZ]] [[AA]], i[[SZ]]* [[AA_ADDR:%.+]], align + // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* + + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] + // + // CHECK: [[WORKER]] + // CHECK: {{call|invoke}} void [[T2]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] + // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + // + // CHECK-NOT: kmpc_fork_teams + // CHECK: [[AA_VAL:%.+]] = load i16, i16* [[CONV]], align + // CHECK: [[ACP:%.+]] = bitcast i[[SZ]]* [[AC:%.+]] to i16* + // CHECK: store i16 [[AA_VAL]], i16* [[ACP]], align + // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align + // CHECK: store i[[SZ]] [[ACV]], i[[SZ]]* [[AA_ADDR_T:%.+]], align + // CHECK: [[CONV2:%.+]] = bitcast i[[SZ]]* [[AA_ADDR_T]] to i16* + // CHECK: store i16 1, i16* [[CONV2]], align + // CHECK: br label {{%?}}[[TERMINATE:.+]] + // + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: br label {{%?}}[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + + +#endif diff --git a/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/test/OpenMP/nvptx_teams_reduction_codegen.cpp new file mode 100644 index 000000000000..ae129ebfae4d --- /dev/null +++ b/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -0,0 +1,1143 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. +// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] + +// Check that the execution mode of all 3 target regions is set to Generic Mode. +// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1 + +template<typename tx> +tx ftemplate(int n) { + int a; + short b; + tx c; + float d; + double e; + + #pragma omp target + #pragma omp teams reduction(+: e) + { + e += 5; + } + + #pragma omp target + #pragma omp teams reduction(^: c) reduction(*: d) + { + c ^= 2; + d *= 33; + } + + #pragma omp target + #pragma omp teams reduction(|: a) reduction(max: b) + { + a |= 1; + b = 99 > b ? 99 : b; + } + + return a+b+c+d+e; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<char>(n); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker() + + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]]( + // + // CHECK: {{call|invoke}} void [[T1]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 + // CHECK: store double [[ADD]], double* [[E]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i[[SZ:32|64]] 0, i{{32|64}} 0 + // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* + // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 1, i[[SZ]] {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] + // CHECK: store double [[ADD]], double* [[E_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], + // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], + // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], + // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], + // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] + // CHECK: store double [[RES]], double* [[VAR_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double + // + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + + + + + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker() + + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]]( + // + // CHECK: {{call|invoke}} void [[T2]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align + // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align + // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} + // CHECK: store float [[MUL]], float* [[D]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: store i8* [[C]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* + // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align + // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 + // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align + // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align + // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] + // CHECK: store float [[MUL]], float* [[D_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* + // + // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], + // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 + // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], + // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: store float [[RES]], float* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 + // + // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float + // + // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[SCRATCHPAD_ELT_PTR_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + + + + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker() + + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]]( + // + // CHECK: {{call|invoke}} void [[T3]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store i32 0, i32* [[A:%.+]], align + // CHECK: store i16 -32768, i16* [[B:%.+]], align + // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align + // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 + // CHECK: store i32 [[OR]], i32* [[A]], align + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align + // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 + // CHECK: store i16 [[TRUNC]], i16* [[B]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* + // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* + // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align + // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align + // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] + // CHECK: store i32 [[OR]], i32* [[A_IN]], align + // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align + // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* + // + // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], + // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], + // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 + // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 + // + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 + // + // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + +#endif diff --git a/test/OpenMP/openmp_check.cpp b/test/OpenMP/openmp_check.cpp index c9b5eb0b9cba..d46213a90e9f 100644 --- a/test/OpenMP/openmp_check.cpp +++ b/test/OpenMP/openmp_check.cpp @@ -1,15 +1,35 @@ // RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++98 %s +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 %s + int nested(int a) { #pragma omp parallel ++a; - auto F = [&]() { // expected-error {{expected expression}} expected-error {{expected ';' at end of declaration}} expected-warning {{'auto' type specifier is a C++11 extension}} + auto F = [&]() { +#if __cplusplus <= 199711L + // expected-warning@-2 {{'auto' type specifier is a C++11 extension}} + // expected-error@-3 {{expected expression}} + // expected-error@-4 {{expected ';' at end of declaration}} +#else + // expected-no-diagnostics +#endif + #pragma omp parallel { #pragma omp target ++a; } }; - F(); // expected-error {{C++ requires a type specifier for all declarations}} - return a; // expected-error {{expected unqualified-id}} -}// expected-error {{extraneous closing brace ('}')}} + F(); +#if __cplusplus <= 199711L + // expected-error@-2 {{C++ requires a type specifier for all declarations}} +#endif + return a; +#if __cplusplus <= 199711L + // expected-error@-2 {{expected unqualified-id}} +#endif +} +#if __cplusplus <= 199711L +// expected-error@-2 {{extraneous closing brace ('}')}} +#endif diff --git a/test/OpenMP/ordered_messages.cpp b/test/OpenMP/ordered_messages.cpp index 36f9bb2c7777..eb2c18e4690c 100644 --- a/test/OpenMP/ordered_messages.cpp +++ b/test/OpenMP/ordered_messages.cpp @@ -1,4 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++98 -o - %s +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s int foo(); @@ -123,6 +125,9 @@ T foo() { #pragma omp ordered depend(sink : j, i) // expected-error {{expected 'i' loop iteration variable}} expected-error {{expected 'j' loop iteration variable}} #pragma omp ordered depend(sink : i, j, k) // expected-error {{unexpected expression: number of expressions is larger than the number of associated loops}} #pragma omp ordered depend(sink : i+foo(), j/4) // expected-error {{expression is not an integral constant expression}} expected-error {{expected '+' or '-' operation}} +#if __cplusplus >= 201103L +// expected-note@-2 {{non-constexpr function 'foo' cannot be used in a constant expression}} +#endif #pragma omp ordered depend(sink : i*0, j-4)// expected-error {{expected '+' or '-' operation}} #pragma omp ordered depend(sink : i-0, j+sizeof(T)) depend(sink : i-0, j+sizeof(T)) #pragma omp ordered depend(sink : i-0, j+sizeof(T)) depend(source) // expected-error {{'depend(source)' clause cannot be mixed with 'depend(sink:vec)' clauses}} @@ -133,6 +138,9 @@ T foo() { } int foo() { +#if __cplusplus >= 201103L +// expected-note@-2 2 {{declared here}} +#endif int k; #pragma omp for ordered for (int i = 0; i < 10; ++i) { @@ -252,6 +260,9 @@ int k; #pragma omp ordered depend(sink : j, i) // expected-error {{expected 'i' loop iteration variable}} expected-error {{expected 'j' loop iteration variable}} #pragma omp ordered depend(sink : i, j, k) // expected-error {{unexpected expression: number of expressions is larger than the number of associated loops}} #pragma omp ordered depend(sink : i+foo(), j/4) // expected-error {{expression is not an integral constant expression}} expected-error {{expected '+' or '-' operation}} +#if __cplusplus >= 201103L +// expected-note@-2 {{non-constexpr function 'foo' cannot be used in a constant expression}} +#endif #pragma omp ordered depend(sink : i*0, j-4)// expected-error {{expected '+' or '-' operation}} #pragma omp ordered depend(sink : i-0, j+sizeof(int)) depend(sink : i-0, j+sizeof(int)) #pragma omp ordered depend(sink : i-0, j+sizeof(int)) depend(source) // expected-error {{'depend(source)' clause cannot be mixed with 'depend(sink:vec)' clauses}} diff --git a/test/OpenMP/target_parallel_codegen.cpp b/test/OpenMP/target_parallel_codegen.cpp new file mode 100644 index 000000000000..c7acb27cab74 --- /dev/null +++ b/test/OpenMP/target_parallel_codegen.cpp @@ -0,0 +1,802 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } + +// CHECK-DAG: [[TT:%.+]] = type { i64, i8 } +// CHECK-DAG: [[S1:%.+]] = type { double } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } + +// We have 8 target regions, but only 7 that actually will generate offloading +// code, only 6 will have mapped arguments, and only 4 have all-constant map +// sizes. + +// CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 2] +// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2] +// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 288, i32 288] +// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 288, i32 35, i32 288, i32 35, i32 35, i32 288, i32 288, i32 35, i32 35] +// CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 40] +// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 288, i32 288, i32 35] +// CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40] +// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 288, i32 288, i32 288, i32 35] +// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 35, i32 288, i32 288, i32 288, i32 35] +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK-NOT: @{{.+}} = constant [[ENTTY]] + +// Check if offloading descriptor is created. +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// Check target registration is registered as a Ctor. +// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + + +template<typename tx, typename ty> +struct TT{ + tx X; + ty Y; +}; + +// CHECK: define {{.*}}[[FOO:@.+]]( +int foo(int n) { + int a = 0; + short aa = 0; + float b[10]; + float bn[n]; + double c[5][10]; + double cn[5][n]; + TT<long long, char> d; + + // CHECK: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i32* null, i32 1, i32 0) + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target parallel + { + } + + // CHECK: store i32 0, i32* [[RHV:%.+]], align 4 + // CHECK: store i32 -1, i32* [[RHV]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK: call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}}) + #pragma omp target parallel if(target: 0) + { + a += 1; + } + + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET2]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT2]], i32 0, i32 0), i32 1, i32 0) + // CHECK-DAG: [[BP]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BPR:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[P]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BPR]], i32 0, i32 [[IDX0:[0-9]+]] + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR]], i32 0, i32 [[IDX0]] + // CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] + // CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] + // CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK-DAG: [[P0]] = inttoptr i[[SZ]] %{{.+}} to i8* + + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT2:@.+]](i[[SZ]] {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target parallel if(target: 1) + { + aa += 1; + } + + // CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 10 + // CHECK: br i1 [[IF]], label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]] + // CHECK: [[IFTHEN]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i32* getelementptr inbounds ([2 x i32], [2 x i32]* [[MAPT3]], i32 0, i32 0), i32 1, i32 0) + // CHECK-DAG: [[BPR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[PR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%[^,]+]], i32 0, i32 0 + + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0 + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0 + // CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] + // CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] + // CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK-DAG: [[P0]] = inttoptr i[[SZ]] %{{.+}} to i8* + + // CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1 + // CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1 + // CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] + // CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] + // CHECK-DAG: [[BP1]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK-DAG: [[P1]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK-NEXT: br label %[[IFEND:.+]] + + // CHECK: [[IFELSE]] + // CHECK: store i32 -1, i32* [[RHV]], align 4 + // CHECK-NEXT: br label %[[IFEND:.+]] + + // CHECK: [[IFEND]] + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT3:@.+]]({{[^,]+}}, {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target parallel if(target: n>10) + { + a += 1; + aa += 1; + } + + // We capture 3 VLA sizes in this target region + // CHECK-64: [[A_VAL:%.+]] = load i32, i32* %{{.+}}, + // CHECK-64: [[A_ADDR:%.+]] = bitcast i[[SZ]]* [[A_CADDR:%.+]] to i32* + // CHECK-64: store i32 [[A_VAL]], i32* [[A_ADDR]], + // CHECK-64: [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]], + + // CHECK-32: [[A_VAL:%.+]] = load i32, i32* %{{.+}}, + // CHECK-32: store i32 [[A_VAL]], i32* [[A_CADDR:%.+]], + // CHECK-32: [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]], + + // CHECK: [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4 + // CHECK: [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]] + // CHECK: [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8 + + // CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20 + // CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] + // CHECK: [[TRY]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i32* getelementptr inbounds ([9 x i32], [9 x i32]* [[MAPT4]], i32 0, i32 0), i32 1, i32 0) + // CHECK-DAG: [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[SR]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S:%[^,]+]], i32 0, i32 0 + + // CHECK-DAG: [[SADDR0:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX0:[0-9]+]] + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX0]] + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX0]] + // CHECK-DAG: [[SADDR1:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX1:[0-9]+]] + // CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX1]] + // CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX1]] + // CHECK-DAG: [[SADDR2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX2:[0-9]+]] + // CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX2]] + // CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX2]] + // CHECK-DAG: [[SADDR3:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX3:[0-9]+]] + // CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX3]] + // CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX3]] + // CHECK-DAG: [[SADDR4:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX4:[0-9]+]] + // CHECK-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX4]] + // CHECK-DAG: [[PADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX4]] + // CHECK-DAG: [[SADDR5:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX5:[0-9]+]] + // CHECK-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX5]] + // CHECK-DAG: [[PADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX5]] + // CHECK-DAG: [[SADDR6:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX6:[0-9]+]] + // CHECK-DAG: [[BPADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX6]] + // CHECK-DAG: [[PADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX6]] + // CHECK-DAG: [[SADDR7:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX7:[0-9]+]] + // CHECK-DAG: [[BPADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX7]] + // CHECK-DAG: [[PADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX7]] + // CHECK-DAG: [[SADDR8:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX8:[0-9]+]] + // CHECK-DAG: [[BPADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX8]] + // CHECK-DAG: [[PADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX8]] + + // The names below are not necessarily consistent with the names used for the + // addresses above as some are repeated. + // CHECK-DAG: [[BP0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* + // CHECK-DAG: [[P0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* + // CHECK-DAG: store i8* [[BP0]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P0]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP1:%[^,]+]] = inttoptr i[[SZ]] [[VLA1]] to i8* + // CHECK-DAG: [[P1:%[^,]+]] = inttoptr i[[SZ]] [[VLA1]] to i8* + // CHECK-DAG: store i8* [[BP1]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P1]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: store i8* inttoptr (i[[SZ]] 5 to i8*), i8** {{%[^,]+}} + // CHECK-DAG: store i8* inttoptr (i[[SZ]] 5 to i8*), i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP3:%[^,]+]] = inttoptr i[[SZ]] [[A_CVAL]] to i8* + // CHECK-DAG: [[P3:%[^,]+]] = inttoptr i[[SZ]] [[A_CVAL]] to i8* + // CHECK-DAG: store i8* [[BP3]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P3]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8* + // CHECK-DAG: [[P4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP4]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P4]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 40, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP5:%[^,]+]] = bitcast float* %{{.+}} to i8* + // CHECK-DAG: [[P5:%[^,]+]] = bitcast float* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP5]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P5]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] [[BNSIZE]], i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8* + // CHECK-DAG: [[P6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP6]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P6]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 400, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP7:%[^,]+]] = bitcast double* %{{.+}} to i8* + // CHECK-DAG: [[P7:%[^,]+]] = bitcast double* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP7]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P7]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] [[CNSIZE]], i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8* + // CHECK-DAG: [[P8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP8]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P8]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{12|16}}, i[[SZ]]* {{%[^,]+}} + + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + + // CHECK: [[FAIL]] + // CHECK: call void [[HVT4:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target parallel if(target: n>20) + { + a += 1; + b[2] += 1.0; + bn[3] += 1.0; + c[1][2] += 1.0; + cn[1][3] += 1.0; + d.X += 1; + d.Y += 1; + } + + return a; +} + +// Check that the offloading functions are emitted and that the arguments are +// correct and loaded correctly for the target regions in foo(). + +// CHECK: define internal void [[HVT0]]() +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*)) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid.) +// CHECK: ret void +// CHECK-NEXT: } + + +// CHECK: define internal void [[HVT1]](i[[SZ]] %{{.+}}) +// Create stack storage and store argument in there. +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32* +// CHECK-64: [[AA:%.+]] = load i32, i32* [[AA_CADDR]], align +// CHECK-32: [[AA:%.+]] = load i32, i32* [[AA_ADDR]], align +// CHECK-64: [[AA_C:%.+]] = bitcast i[[SZ]]* [[AA_CASTED]] to i32* +// CHECK-64: store i32 [[AA]], i32* [[AA_C]], align +// CHECK-32: store i32 [[AA]], i32* [[AA_CASTED]], align +// CHECK: [[PARAM:%.+]] = load i[[SZ]], i[[SZ]]* [[AA_CASTED]], align +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]])* [[OMP_OUTLINED1:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[PARAM]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}) +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32* +// CHECK-64: [[AA:%.+]] = load i32, i32* [[AA_CADDR]], align +// CHECK-32: [[AA:%.+]] = load i32, i32* [[AA_ADDR]], align +// CHECK: ret void +// CHECK-NEXT: } + +// CHECK: define internal void [[HVT2]](i[[SZ]] %{{.+}}) +// Create stack storage and store argument in there. +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK: [[AA:%.+]] = load i16, i16* [[AA_CADDR]], align +// CHECK: [[AA_C:%.+]] = bitcast i[[SZ]]* [[AA_CASTED]] to i16* +// CHECK: store i16 [[AA]], i16* [[AA_C]], align +// CHECK: [[PARAM:%.+]] = load i[[SZ]], i[[SZ]]* [[AA_CASTED]], align +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]])* [[OMP_OUTLINED2:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[PARAM]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}) +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK: [[AA:%.+]] = load i16, i16* [[AA_CADDR]], align +// CHECK: ret void +// CHECK-NEXT: } + +// CHECK: define internal void [[HVT3]] +// Create stack storage and store argument in there. +// CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[A_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[A_ADDR]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64-DAG:[[A_CADDR:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i32* +// CHECK-DAG: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK-64-DAG:[[A:%.+]] = load i32, i32* [[A_CADDR]], align +// CHECK-32-DAG:[[A:%.+]] = load i32, i32* [[A_ADDR]], align +// CHECK-64-DAG:[[A_C:%.+]] = bitcast i[[SZ]]* [[A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[A]], i32* [[A_C]], align +// CHECK-32-DAG:store i32 [[A]], i32* [[A_CASTED]], align +// CHECK-DAG: [[AA:%.+]] = load i16, i16* [[AA_CADDR]], align +// CHECK-DAG: [[AA_C:%.+]] = bitcast i[[SZ]]* [[AA_CASTED]] to i16* +// CHECK-DAG: store i16 [[AA]], i16* [[AA_C]], align +// CHECK-DAG: [[PARAM1:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CASTED]], align +// CHECK-DAG: [[PARAM2:%.+]] = load i[[SZ]], i[[SZ]]* [[AA_CASTED]], align +// CHECK-DAG: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]])* [[OMP_OUTLINED3:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[PARAM1]], i[[SZ]] [[PARAM2]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED3]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}) +// CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[A_ADDR]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64-DAG:[[A_CADDR:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i32* +// CHECK-DAG: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK: ret void +// CHECK-NEXT: } + +// CHECK: define internal void [[HVT4]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* +// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_BN:%.+]] = alloca float* +// CHECK: [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]* +// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_VLA3:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_CN:%.+]] = alloca double* +// CHECK: [[LOCAL_D:%.+]] = alloca [[TT]]* +// CHECK: [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// CHECK-DAG: store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]] +// CHECK-DAG: store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]] +// CHECK-DAG: store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA3:%.+]], i[[SZ]]* [[LOCAL_VLA3]] +// CHECK-DAG: store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]] +// CHECK-DAG: store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]] + +// CHECK-64-DAG:[[CONV_AP:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// CHECK-DAG: [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]], +// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], +// CHECK-DAG: [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]], +// CHECK-DAG: [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]], +// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], +// CHECK-DAG: [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA3]], +// CHECK-DAG: [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]], +// CHECK-DAG: [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]], + +// CHECK-64-DAG:[[CONV_A:%.+]] = load i32, i32* [[CONV_AP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_A]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_AV:%.+]] = load i32, i32* [[LOCAL_A]] +// CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align +// CHECK-DAG: [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 9, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], [10 x float]*, i[[SZ]], float*, [5 x [10 x double]]*, i[[SZ]], i[[SZ]], double*, [[TT]]*)* [[OMP_OUTLINED4:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], [10 x float]* [[REF_B]], i[[SZ]] [[VAL_VLA1]], float* [[REF_BN]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] [[VAL_VLA2]], i[[SZ]] [[VAL_VLA3]], double* [[REF_CN]], [[TT]]* [[REF_D]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED4]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, [10 x float]* {{.+}}, i[[SZ]] %{{.+}}, float* %{{.+}}, [5 x [10 x double]]* {{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, double* %{{.+}}, [[TT]]* {{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target parallel if(target: n>40) + { + a += 1; + aa += 1; + b[2] += 1; + } + + return a; +} + +static +int fstatic(int n) { + int a = 0; + short aa = 0; + char aaa = 0; + int b[10]; + + #pragma omp target parallel if(target: n>50) + { + a += 1; + aa += 1; + aaa += 1; + b[2] += 1; + } + + return a; +} + +struct S1 { + double a; + + int r1(int n){ + int b = n+1; + short int c[2][n]; + + #pragma omp target parallel if(target: n>60) + { + this->a = (double)b + 1.5; + c[1][1] = ++a; + } + + return c[1][1] + (int)b; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + // CHECK: call {{.*}}i32 [[FOO]](i32 {{.*}}) + a += foo(n); + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + +// +// CHECK: define {{.*}}[[FS1]] +// +// CHECK: i8* @llvm.stacksave() +// CHECK-64: [[B_ADDR:%.+]] = bitcast i[[SZ]]* [[B_CADDR:%.+]] to i32* +// CHECK-64: store i32 %{{.+}}, i32* [[B_ADDR]], +// CHECK-64: [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_CADDR]], + +// CHECK-32: store i32 %{{.+}}, i32* [[B_ADDR:%.+]], +// CHECK-32: [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]], + +// We capture 2 VLA sizes in this target region +// CHECK: [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]] +// CHECK: [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2 + +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60 +// CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] +// CHECK: [[TRY]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i32* getelementptr inbounds ([5 x i32], [5 x i32]* [[MAPT7]], i32 0, i32 0), i32 1, i32 0) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0 +// CHECK-DAG: [[SR]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S:%.+]], i32 0, i32 0 +// CHECK-DAG: [[SADDR0:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX0:[0-9]+]] +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]] +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]] +// CHECK-DAG: [[SADDR1:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX1:[0-9]+]] +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]] +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]] +// CHECK-DAG: [[SADDR2:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX2:[0-9]+]] +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]] +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]] +// CHECK-DAG: [[SADDR3:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX3:[0-9]+]] +// CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]] +// CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]] + +// The names below are not necessarily consistent with the names used for the +// addresses above as some are repeated. +// CHECK-DAG: [[BP0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* +// CHECK-DAG: [[P0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* +// CHECK-DAG: store i8* [[BP0]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P0]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: store i8* inttoptr (i[[SZ]] 2 to i8*), i8** {{%[^,]+}} +// CHECK-DAG: store i8* inttoptr (i[[SZ]] 2 to i8*), i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP2:%[^,]+]] = inttoptr i[[SZ]] [[B_CVAL]] to i8* +// CHECK-DAG: [[P2:%[^,]+]] = inttoptr i[[SZ]] [[B_CVAL]] to i8* +// CHECK-DAG: store i8* [[BP2]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P2]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8* +// CHECK-DAG: [[P3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP3]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P3]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] 8, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP4:%[^,]+]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: [[P4:%[^,]+]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP4]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P4]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] [[CSIZE]], i[[SZ]]* {{%[^,]+}} + +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + +// CHECK: [[FAIL]] +// CHECK: call void [[HVT7:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// +// CHECK: define {{.*}}[[FSTATIC]] +// +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 50 +// CHECK: br i1 [[IF]], label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]] +// CHECK: [[IFTHEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 4, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([4 x i[[SZ]]], [4 x i[[SZ]]]* [[SIZET6]], i32 0, i32 0), i32* getelementptr inbounds ([4 x i32], [4 x i32]* [[MAPT6]], i32 0, i32 0), i32 1, i32 0) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P:%.+]], i32 0, i32 0 + +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 0 +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 0 +// CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] +// CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] +// CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] [[VAL0:%.+]] to i8* +// CHECK-DAG: [[P0]] = inttoptr i[[SZ]] [[VAL0]] to i8* + +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 1 +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 1 +// CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] +// CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] +// CHECK-DAG: [[BP1]] = inttoptr i[[SZ]] [[VAL1:%.+]] to i8* +// CHECK-DAG: [[P1]] = inttoptr i[[SZ]] [[VAL1]] to i8* + +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 2 +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 2 +// CHECK-DAG: store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]] +// CHECK-DAG: store i8* [[P2:%[^,]+]], i8** [[PADDR2]] + +// CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 3 +// CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 3 +// CHECK-DAG: store i8* [[BP3:%[^,]+]], i8** [[BPADDR3]] +// CHECK-DAG: store i8* [[P3:%[^,]+]], i8** [[PADDR3]] +// CHECK-DAG: [[BP3]] = bitcast [10 x i32]* %{{.+}} to i8* +// CHECK-DAG: [[P3]] = bitcast [10 x i32]* %{{.+}} to i8* + +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFEND]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 40 +// CHECK: br i1 [[IF]], label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]] +// CHECK: [[IFTHEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([3 x i[[SZ]]], [3 x i[[SZ]]]* [[SIZET5]], i32 0, i32 0), i32* getelementptr inbounds ([3 x i32], [3 x i32]* [[MAPT5]], i32 0, i32 0), i32 1, i32 0) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P:%.+]], i32 0, i32 0 + +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 0 +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 0 +// CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] +// CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] +// CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] [[VAL0:%.+]] to i8* +// CHECK-DAG: [[P0]] = inttoptr i[[SZ]] [[VAL0]] to i8* + +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 1 +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 1 +// CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] +// CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] +// CHECK-DAG: [[BP1]] = inttoptr i[[SZ]] [[VAL1:%.+]] to i8* +// CHECK-DAG: [[P1]] = inttoptr i[[SZ]] [[VAL1]] to i8* + +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 2 +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 2 +// CHECK-DAG: store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]] +// CHECK-DAG: store i8* [[P2:%[^,]+]], i8** [[PADDR2]] +// CHECK-DAG: [[BP2]] = bitcast [10 x i32]* %{{.+}} to i8* +// CHECK-DAG: [[P2]] = bitcast [10 x i32]* %{{.+}} to i8* + +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFEND]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + + + +// Check that the offloading functions are emitted and that the arguments are +// correct and loaded correctly for the target regions of the callees of bar(). + +// CHECK: define internal void [[HVT7]] +// Create local storage for each capture. +// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1]]* +// CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_C:%.+]] = alloca i16* +// CHECK: [[LOCAL_B_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]] +// CHECK-DAG: store i[[SZ]] [[ARG_B:%.+]], i[[SZ]]* [[LOCAL_B]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]] +// CHECK-DAG: store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]] +// Store captures in the context. +// CHECK-DAG: [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]], +// CHECK-64-DAG:[[CONV_BP:%.+]] = bitcast i[[SZ]]* [[LOCAL_B]] to i32* +// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], +// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], +// CHECK-DAG: [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]], + +// CHECK-64-DAG:[[CONV_B:%.+]] = load i32, i32* [[CONV_BP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_B_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_B]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_BV:%.+]] = load i32, i32* [[LOCAL_B]] +// CHECK-32-DAG:store i32 [[LOCAL_BV]], i32* [[LOCAL_B_CASTED]], align +// CHECK-DAG: [[REF_B:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_B_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[S1]]*, i[[SZ]], i[[SZ]], i[[SZ]], i16*)* [[OMP_OUTLINED5:@.+]] to void (i32*, i32*, ...)*), [[S1]]* [[REF_THIS]], i[[SZ]] [[REF_B]], i[[SZ]] [[VAL_VLA1]], i[[SZ]] [[VAL_VLA2]], i16* [[REF_C]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* %{{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + + +// CHECK: define internal void [[HVT6]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AAA:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* +// CHECK: [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA_CASTED:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AAA_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] +// CHECK-DAG: store i[[SZ]] [[ARG_AAA:%.+]], i[[SZ]]* [[LOCAL_AAA]] +// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] +// Store captures in the context. +// CHECK-64-DAG:[[CONV_AP:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// CHECK-DAG: [[CONV_AAP:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* +// CHECK-DAG: [[CONV_AAAP:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8* +// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], + +// CHECK-64-DAG:[[CONV_A:%.+]] = load i32, i32* [[CONV_AP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_A]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_AV:%.+]] = load i32, i32* [[LOCAL_A]] +// CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align +// CHECK-DAG: [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]], + +// CHECK-DAG: [[CONV_AA:%.+]] = load i16, i16* [[CONV_AAP]] +// CHECK-DAG: [[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA_CASTED]] to i16* +// CHECK-DAG: store i16 [[CONV_AA]], i16* [[CONV]], align +// CHECK-DAG: [[REF_AA:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_AA_CASTED]], + +// CHECK-DAG: [[CONV_AAA:%.+]] = load i8, i8* [[CONV_AAAP]] +// CHECK-DAG: [[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA_CASTED]] to i8* +// CHECK-DAG: store i8 [[CONV_AAA]], i8* [[CONV]], align +// CHECK-DAG: [[REF_AAA:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_AAA_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]], i[[SZ]], [10 x i32]*)* [[OMP_OUTLINED6:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], i[[SZ]] [[REF_AA]], i[[SZ]] [[REF_AAA]], [10 x i32]* [[REF_B]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED6]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x i32]* {{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + +// CHECK: define internal void [[HVT5]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* +// CHECK: [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] +// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] +// Store captures in the context. +// CHECK-64-DAG:[[CONV_AP:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// CHECK-DAG: [[CONV_AAP:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* +// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], + +// CHECK-64-DAG:[[CONV_A:%.+]] = load i32, i32* [[CONV_AP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_A]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_AV:%.+]] = load i32, i32* [[LOCAL_A]] +// CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align +// CHECK-DAG: [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]], + +// CHECK-DAG: [[CONV_AA:%.+]] = load i16, i16* [[CONV_AAP]] +// CHECK-DAG: [[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA_CASTED]] to i16* +// CHECK-DAG: store i16 [[CONV_AA]], i16* [[CONV]], align +// CHECK-DAG: [[REF_AA:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_AA_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]], [10 x i32]*)* [[OMP_OUTLINED7:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], i[[SZ]] [[REF_AA]], [10 x i32]* [[REF_B]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED7]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x i32]* {{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + +#endif diff --git a/test/OpenMP/target_parallel_codegen_registration.cpp b/test/OpenMP/target_parallel_codegen_registration.cpp new file mode 100644 index 000000000000..6ba137ff10b6 --- /dev/null +++ b/test/OpenMP/target_parallel_codegen_registration.cpp @@ -0,0 +1,437 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// Test target parallel codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// Check that no target code is emmitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: [[SA:%.+]] = type { [4 x i32] } +// CHECK-DAG: [[SB:%.+]] = type { [8 x i32] } +// CHECK-DAG: [[SC:%.+]] = type { [16 x i32] } +// CHECK-DAG: [[SD:%.+]] = type { [32 x i32] } +// CHECK-DAG: [[SE:%.+]] = type { [64 x i32] } +// CHECK-DAG: [[ST1:%.+]] = type { [228 x i32] } +// CHECK-DAG: [[ST2:%.+]] = type { [1128 x i32] } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } + +// CHECK-DAG: [[A1:@.+]] = internal global [[SA]] +// CHECK-DAG: [[A2:@.+]] = global [[SA]] +// CHECK-DAG: [[B1:@.+]] = global [[SB]] +// CHECK-DAG: [[B2:@.+]] = global [[SB]] +// CHECK-DAG: [[C1:@.+]] = internal global [[SC]] +// CHECK-DAG: [[D1:@.+]] = global [[SD]] +// CHECK-DAG: [[E1:@.+]] = global [[SE]] +// CHECK-DAG: [[T1:@.+]] = global [[ST1]] +// CHECK-DAG: [[T2:@.+]] = global [[ST2]] + +// CHECK-NTARGET-DAG: [[SA:%.+]] = type { [4 x i32] } +// CHECK-NTARGET-DAG: [[SB:%.+]] = type { [8 x i32] } +// CHECK-NTARGET-DAG: [[SC:%.+]] = type { [16 x i32] } +// CHECK-NTARGET-DAG: [[SD:%.+]] = type { [32 x i32] } +// CHECK-NTARGET-DAG: [[SE:%.+]] = type { [64 x i32] } +// CHECK-NTARGET-DAG: [[ST1:%.+]] = type { [228 x i32] } +// CHECK-NTARGET-DAG: [[ST2:%.+]] = type { [1128 x i32] } +// CHECK-NTARGET-NOT: type { i8*, i8*, % +// CHECK-NTARGET-NOT: type { i32, % + +// We have 7 target regions + +// CHECK-DAG: {{@.+}} = private constant i8 0 +// TCHECK-NOT: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] + +// CHECK-NTARGET-NOT: private constant i8 0 +// CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i + +// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00" +// CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00" +// CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00" +// CHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00" +// CHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00" +// CHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00" +// CHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00" +// CHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00" +// CHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00" +// CHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00" +// CHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00" +// CHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00" +// CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 + +// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00" +// TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00" +// TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00" +// TCHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00" +// TCHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00" +// TCHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00" +// TCHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00" +// TCHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00" +// TCHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00" +// TCHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00" +// TCHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00" +// TCHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00" +// TCHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 + +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. +// CHECK: @llvm.global_ctors = appending global [4 x { i32, void ()*, i8* }] [ +// CHECK-SAME: { i32, void ()*, i8* } { i32 500, void ()* [[P500:@[^,]+]], i8* null }, +// CHECK-SAME: { i32, void ()*, i8* } { i32 501, void ()* [[P501:@[^,]+]], i8* null }, +// CHECK-SAME: { i32, void ()*, i8* } { i32 65535, void ()* [[PMAX:@[^,]+]], i8* null }, +// CHECK-SAME: { i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + +// CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, void ()*, i8* }] [ + +extern int *R; + +struct SA { + int arr[4]; + void foo() { + int a = *R; + a += 1; + *R = a; + } + SA() { + int a = *R; + a += 2; + *R = a; + } + ~SA() { + int a = *R; + a += 3; + *R = a; + } +}; + +struct SB { + int arr[8]; + void foo() { + int a = *R; + #pragma omp target parallel + a += 4; + *R = a; + } + SB() { + int a = *R; + a += 5; + *R = a; + } + ~SB() { + int a = *R; + a += 6; + *R = a; + } +}; + +struct SC { + int arr[16]; + void foo() { + int a = *R; + a += 7; + *R = a; + } + SC() { + int a = *R; + #pragma omp target parallel + a += 8; + *R = a; + } + ~SC() { + int a = *R; + a += 9; + *R = a; + } +}; + +struct SD { + int arr[32]; + void foo() { + int a = *R; + a += 10; + *R = a; + } + SD() { + int a = *R; + a += 11; + *R = a; + } + ~SD() { + int a = *R; + #pragma omp target parallel + a += 12; + *R = a; + } +}; + +struct SE { + int arr[64]; + void foo() { + int a = *R; + #pragma omp target parallel if(target: 0) + a += 13; + *R = a; + } + SE() { + int a = *R; + #pragma omp target parallel + a += 14; + *R = a; + } + ~SE() { + int a = *R; + #pragma omp target parallel + a += 15; + *R = a; + } +}; + +template <int x> +struct ST { + int arr[128 + x]; + void foo() { + int a = *R; + #pragma omp target parallel + a += 16 + x; + *R = a; + } + ST() { + int a = *R; + #pragma omp target parallel + a += 17 + x; + *R = a; + } + ~ST() { + int a = *R; + #pragma omp target parallel + a += 18 + x; + *R = a; + } +}; + +// We have to make sure we us all the target regions: +//CHECK-DAG: define internal void @[[NAME1]]( +//CHECK-DAG: call void @[[NAME1]]( +//CHECK-DAG: define internal void @[[NAME2]]( +//CHECK-DAG: call void @[[NAME2]]( +//CHECK-DAG: define internal void @[[NAME3]]( +//CHECK-DAG: call void @[[NAME3]]( +//CHECK-DAG: define internal void @[[NAME4]]( +//CHECK-DAG: call void @[[NAME4]]( +//CHECK-DAG: define internal void @[[NAME5]]( +//CHECK-DAG: call void @[[NAME5]]( +//CHECK-DAG: define internal void @[[NAME6]]( +//CHECK-DAG: call void @[[NAME6]]( +//CHECK-DAG: define internal void @[[NAME7]]( +//CHECK-DAG: call void @[[NAME7]]( +//CHECK-DAG: define internal void @[[NAME8]]( +//CHECK-DAG: call void @[[NAME8]]( +//CHECK-DAG: define internal void @[[NAME9]]( +//CHECK-DAG: call void @[[NAME9]]( +//CHECK-DAG: define internal void @[[NAME10]]( +//CHECK-DAG: call void @[[NAME10]]( +//CHECK-DAG: define internal void @[[NAME11]]( +//CHECK-DAG: call void @[[NAME11]]( +//CHECK-DAG: define internal void @[[NAME12]]( +//CHECK-DAG: call void @[[NAME12]]( + +//TCHECK-DAG: define void @[[NAME1]]( +//TCHECK-DAG: define void @[[NAME2]]( +//TCHECK-DAG: define void @[[NAME3]]( +//TCHECK-DAG: define void @[[NAME4]]( +//TCHECK-DAG: define void @[[NAME5]]( +//TCHECK-DAG: define void @[[NAME6]]( +//TCHECK-DAG: define void @[[NAME7]]( +//TCHECK-DAG: define void @[[NAME8]]( +//TCHECK-DAG: define void @[[NAME9]]( +//TCHECK-DAG: define void @[[NAME10]]( +//TCHECK-DAG: define void @[[NAME11]]( +//TCHECK-DAG: define void @[[NAME12]]( + +// CHECK-NTARGET-NOT: __tgt_target +// CHECK-NTARGET-NOT: __tgt_register_lib +// CHECK-NTARGET-NOT: __tgt_unregister_lib + +// TCHECK-NOT: __tgt_target +// TCHECK-NOT: __tgt_register_lib +// TCHECK-NOT: __tgt_unregister_lib + +// We have 2 initializers with priority 500 +//CHECK: define internal void [[P500]]( +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK-NOT: call void @{{.+}}() +//CHECK: ret void + +// We have 1 initializers with priority 501 +//CHECK: define internal void [[P501]]( +//CHECK: call void @{{.+}}() +//CHECK-NOT: call void @{{.+}}() +//CHECK: ret void + +// We have 6 initializers with default priority +//CHECK: define internal void [[PMAX]]( +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK-NOT: call void @{{.+}}() +//CHECK: ret void + +// Check registration and unregistration + +//CHECK: define internal void [[UNREGFN:@.+]](i8*) +//CHECK: call i32 @__tgt_unregister_lib([[DSCTY]]* [[DESC]]) +//CHECK: ret void +//CHECK: declare i32 @__tgt_unregister_lib([[DSCTY]]*) + +//CHECK: define internal void [[REGFN]](i8*) +//CHECK: call i32 @__tgt_register_lib([[DSCTY]]* [[DESC]]) +//CHECK: call i32 @__cxa_atexit(void (i8*)* [[UNREGFN]], i8* bitcast ([[DSCTY]]* [[DESC]] to i8*), +//CHECK: ret void +//CHECK: declare i32 @__tgt_register_lib([[DSCTY]]*) + +static __attribute__((init_priority(500))) SA a1; +SA a2; +SB __attribute__((init_priority(500))) b1; +SB __attribute__((init_priority(501))) b2; +static SC c1; +SD d1; +SE e1; +ST<100> t1; +ST<1000> t2; + + +int bar(int a){ + int r = a; + + a1.foo(); + a2.foo(); + b1.foo(); + b2.foo(); + c1.foo(); + d1.foo(); + e1.foo(); + t1.foo(); + t2.foo(); + + #pragma omp target parallel + ++r; + + return r + *R; +} + +// Check metadata is properly generated: +// CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}} + +// TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}} + +#endif diff --git a/test/OpenMP/target_parallel_codegen_registration_naming.cpp b/test/OpenMP/target_parallel_codegen_registration_naming.cpp new file mode 100644 index 000000000000..4b7940290d07 --- /dev/null +++ b/test/OpenMP/target_parallel_codegen_registration_naming.cpp @@ -0,0 +1,66 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// Test target parallel codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK: [[CA:%.+]] = type { i32* } + +// CHECK: define {{.*}}i32 @[[NNAME:.+]](i32 {{.*}}%{{.+}}) +int nested(int a){ + // CHECK: call void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME]]_l[[T1L:[0-9]+]]( + #pragma omp target parallel + ++a; + + // CHECK: call void @"[[LNAME:.+]]"([[CA]]* + auto F = [&](){ + #pragma omp parallel + { + #pragma omp target parallel + ++a; + } + }; + + F(); + + return a; +} + +// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T1L]]( +// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME:.+]]_l[[T1L:[0-9]+]]( + +// CHECK: define {{.*}}void @"[[LNAME]]"( +// CHECK: call void {{.*}}@__kmpc_fork_call{{.+}}[[PNAME:@.+]] to + +// CHECK: define {{.*}}void [[PNAME]]( +// CHECK: call void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L:[0-9]+]]( + +// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L]]( +// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME:.+]]_l[[T2L:[0-9]+]]( + + +// Check metadata is properly generated: +// CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}} + +// TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}} +#endif diff --git a/test/OpenMP/target_parallel_default_messages.cpp b/test/OpenMP/target_parallel_default_messages.cpp index 40f31b84848a..cf497937f3e0 100644 --- a/test/OpenMP/target_parallel_default_messages.cpp +++ b/test/OpenMP/target_parallel_default_messages.cpp @@ -23,5 +23,8 @@ int main(int argc, char **argv) { foo(); #pragma omp target parallel default(shared) ++argc; + #pragma omp target parallel default(none) + #pragma omp parallel default(shared) + ++argc; return 0; } diff --git a/test/OpenMP/target_parallel_for_collapse_messages.cpp b/test/OpenMP/target_parallel_for_collapse_messages.cpp index 8cf502be0d9c..6163b52fd21c 100644 --- a/test/OpenMP/target_parallel_for_collapse_messages.cpp +++ b/test/OpenMP/target_parallel_for_collapse_messages.cpp @@ -1,9 +1,14 @@ // RUN: %clang_cc1 -verify -fopenmp %s +// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s +// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s void foo() { } bool foobool(int argc) { +#if __cplusplus >= 201103L +// expected-note@-2 4 {{declared here}} +#endif return argc; } @@ -33,10 +38,17 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}} // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}} // expected-error@+1 2 {{expression is not an integral constant expression}} #pragma omp target parallel for collapse (foobool(argc)), collapse (true), collapse (-5) +#if __cplusplus >= 201103L +// expected-note@-2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; #pragma omp target parallel for collapse (S) // expected-error {{'S' does not refer to a value}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; - // expected-error@+1 2 {{expression is not an integral constant expression}} +#if __cplusplus >= 201103L + // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}} +#else + // expected-error@+2 2 {{expression is not an integral constant expression}} +#endif #pragma omp target parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; #pragma omp target parallel for collapse (1) @@ -60,15 +72,25 @@ int main(int argc, char **argv) { #pragma omp target parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}} expected-note {{as specified in 'collapse' clause}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}} #pragma omp target parallel for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}} +#if __cplusplus >= 201103L +// expected-note@-2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error@+3 {{expression is not an integral constant expression}} // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'collapse' clause}} // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}} #pragma omp target parallel for collapse (foobool(argc)), collapse (true), collapse (-5) +#if __cplusplus >= 201103L +// expected-note@-2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; #pragma omp target parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; - // expected-error@+1 {{expression is not an integral constant expression}} +#if __cplusplus >= 201103L + // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}} +#else + // expected-error@+2 {{expression is not an integral constant expression}} +#endif #pragma omp target parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}} diff --git a/test/OpenMP/target_parallel_for_ordered_messages.cpp b/test/OpenMP/target_parallel_for_ordered_messages.cpp index 36eb8371e025..72bb6e34912d 100644 --- a/test/OpenMP/target_parallel_for_ordered_messages.cpp +++ b/test/OpenMP/target_parallel_for_ordered_messages.cpp @@ -1,9 +1,14 @@ // RUN: %clang_cc1 -verify -fopenmp %s +// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s +// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s void foo() { } bool foobool(int argc) { +#if __cplusplus >= 201103L +// expected-note@-2 4 {{declared here}} +#endif return argc; } @@ -36,6 +41,9 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here} #pragma omp target parallel for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for', but found only 1}} +#if __cplusplus >= 201103L +// expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif // expected-error@+3 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}} // expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}} // expected-error@+1 2 {{expression is not an integral constant expression}} @@ -45,7 +53,11 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here} #pragma omp target parallel for ordered(S) // expected-error {{'S' does not refer to a value}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 2 {{expression is not an integral constant expression}} +#if __cplusplus >= 201103L + // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}} +#else + // expected-error@+2 2 {{expression is not an integral constant expression}} +#endif #pragma omp target parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; @@ -76,9 +88,15 @@ int main(int argc, char **argv) { #pragma omp target parallel for ordered(2 + 2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}} expected-note {{as specified in 'ordered' clause}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}} +#if __cplusplus >= 201103L +// expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif #pragma omp target parallel for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; +#if __cplusplus >= 201103L +// expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}} +#endif // expected-error@+3 {{expression is not an integral constant expression}} // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}} // expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}} @@ -88,7 +106,11 @@ int main(int argc, char **argv) { #pragma omp target parallel for ordered(S1) // expected-error {{'S1' does not refer to a value}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; -// expected-error@+1 {{expression is not an integral constant expression}} +#if __cplusplus >= 201103L + // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}} +#else + // expected-error@+2 {{expression is not an integral constant expression}} +#endif #pragma omp target parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; diff --git a/test/OpenMP/target_parallel_if_codegen.cpp b/test/OpenMP/target_parallel_if_codegen.cpp new file mode 100644 index 000000000000..02c69b95e19c --- /dev/null +++ b/test/OpenMP/target_parallel_if_codegen.cpp @@ -0,0 +1,413 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } + +// CHECK-DAG: [[S1:%.+]] = type { double } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } + +// We have 6 target regions + +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] + +// Check if offloading descriptor is created. +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// Check target registration is registered as a Ctor. +// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + + #pragma omp target parallel if(parallel: 0) + { + a += 1; + } + + short b = 1; + #pragma omp target parallel if(parallel: 1) + { + a += b; + } + + return a; +} + +static +int fstatic(int n) { + + #pragma omp target parallel if(n>1) + { + } + + #pragma omp target parallel if(target: n-2>2) + { + } + + return n+1; +} + +struct S1 { + double a; + + int r1(int n){ + int b = 1; + + #pragma omp target parallel if(parallel: n>3) + { + this->a = (double)b + 1.5; + } + + #pragma omp target parallel if(target: n>4) if(parallel: n>5) + { + this->a = 2.5; + } + + return (int)a; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + + + +// +// CHECK: define {{.*}}[[FS1]]([[S1]]* {{%.+}}, i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[CMP:%.+]] = icmp sgt i32 [[NV]], 3 +// CHECK: [[FB:%.+]] = zext i1 [[CMP]] to i8 +// CHECK: store i8 [[FB]], i8* [[CAPE_ADDR:%.+]], align +// CHECK: [[CAPE:%.+]] = load i8, i8* [[CAPE_ADDR]], align +// CHECK: [[TB:%.+]] = trunc i8 [[CAPE]] to i1 +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i8* +// CHECK: [[FB:%.+]] = zext i1 [[TB]] to i8 +// CHECK: store i8 [[FB]], i8* [[CONV]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 1, i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT1:@.+]]([[S1]]* {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[CMP:%.+]] = icmp sgt i32 [[NV]], 5 +// CHECK: [[FB:%.+]] = zext i1 [[CMP]] to i8 +// CHECK: store i8 [[FB]], i8* [[CAPE_ADDR:%.+]], align +// CHECK: [[CAPE:%.+]] = load i8, i8* [[CAPE_ADDR]], align +// CHECK: [[TB:%.+]] = trunc i8 [[CAPE]] to i1 +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i8* +// CHECK: [[FB:%.+]] = zext i1 [[TB]] to i8 +// CHECK: store i8 [[FB]], i8* [[CONV]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[CMP:%.+]] = icmp sgt i32 [[NV]], 4 +// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] +// +// CHECK: [[IF_THEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, {{.*}}, i32 1, i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: br label {{%?}}[[END:.+]] +// +// CHECK: [[IF_ELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT2:@.+]]([[S1]]* {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] + + + + + + +// +// CHECK: define {{.*}}[[FSTATIC]](i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[CMP:%.+]] = icmp sgt i32 [[NV]], 1 +// CHECK: [[FB:%.+]] = zext i1 [[CMP]] to i8 +// CHECK: store i8 [[FB]], i8* [[CAPE_ADDR:%.+]], align +// CHECK: [[CAPE:%.+]] = load i8, i8* [[CAPE_ADDR]], align +// CHECK: [[TB:%.+]] = trunc i8 [[CAPE]] to i1 +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i8* +// CHECK: [[FB:%.+]] = zext i1 [[TB]] to i8 +// CHECK: store i8 [[FB]], i8* [[CONV]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[CAPE2:%.+]] = load i8, i8* [[CAPE_ADDR]], align +// CHECK: [[TB:%.+]] = trunc i8 [[CAPE2]] to i1 +// CHECK: br i1 [[TB]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] +// +// CHECK: [[IF_THEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 1, i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: br label {{%?}}[[END:.+]] +// +// CHECK: [[IF_ELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT3:@.+]](i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK-DAG: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[SUB:%.+]] = sub nsw i32 [[NV]], 2 +// CHECK: [[CMP:%.+]] = icmp sgt i32 [[SUB]], 2 +// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] +// +// CHECK: [[IF_THEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 0, {{.*}}, i32 1, i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: br label {{%?}}[[END:.+]] +// +// CHECK: [[IF_ELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT4:@.+]]() +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] + + + + + + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 1, i32 0) +// CHECK-NEXT: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK-NEXT: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]({{[^,]+}}) +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, {{.*}}, i32 1, i32 0) +// CHECK-NEXT: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK-NEXT: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]]({{[^,]+}}, {{[^,]+}}) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] + + + + + + +// Check that the offloading functions are emitted and that the parallel function +// is appropriately guarded. + +// CHECK: define internal void [[HVT1]]([[S1]]* {{%.+}}, i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM1]], i[[SZ]]* [[B_ADDR:%.+]], align +// CHECK-DAG: store i[[SZ]] [[PARM2]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONVB:%.+]] = bitcast i[[SZ]]* [[B_ADDR]] to i32* +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i8* +// CHECK-64: [[BV:%.+]] = load i32, i32* [[CONVB]], align +// CHECK-32: [[BV:%.+]] = load i32, i32* [[B_ADDR]], align +// CHECK-64: [[BC:%.+]] = bitcast i64* [[ARGA:%.+]] to i32* +// CHECK-64: store i32 [[BV]], i32* [[BC]], align +// CHECK-64: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[ARGA]], align +// CHECK-32: store i32 [[BV]], i32* [[ARGA:%.+]], align +// CHECK-32: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[ARGA]], align +// CHECK: [[IFC:%.+]] = load i8, i8* [[CONV]], align +// CHECK: [[TB:%.+]] = trunc i8 [[IFC]] to i1 +// CHECK: br i1 [[TB]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] +// +// CHECK: [[IF_THEN]] +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[S1]]*, i[[SZ]])* [[OMP_OUTLINED3:@.+]] to void (i32*, i32*, ...)*), [[S1]]* {{.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END:.+]] +// +// CHECK: [[IF_ELSE]] +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: call void [[OMP_OUTLINED3]](i32* {{%.+}}, i32* {{%.+}}, [[S1]]* {{.+}}, i[[SZ]] [[ARG]]) +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// + + +// CHECK: define internal void [[HVT2]]([[S1]]* {{%.+}}, i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i8* +// CHECK: [[IFC:%.+]] = load i8, i8* [[CONV]], align +// CHECK: [[TB:%.+]] = trunc i8 [[IFC]] to i1 +// CHECK: br i1 [[TB]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] +// +// CHECK: [[IF_THEN]] +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[S1]]*)* [[OMP_OUTLINED4:@.+]] to void (i32*, i32*, ...)*), [[S1]]* {{.+}}) +// CHECK: br label {{%?}}[[END:.+]] +// +// CHECK: [[IF_ELSE]] +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: call void [[OMP_OUTLINED4]](i32* {{%.+}}, i32* {{%.+}}, [[S1]]* {{.+}}) +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// + + + + + + + + +// CHECK: define internal void [[HVT3]](i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i8* +// CHECK: [[IFC:%.+]] = load i8, i8* [[CONV]], align +// CHECK: [[TB:%.+]] = trunc i8 [[IFC]] to i1 +// CHECK: br i1 [[TB]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] +// +// CHECK: [[IF_THEN]] +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* [[OMP_OUTLINED1:@.+]] to void (i32*, i32*, ...)*)) +// CHECK: br label {{%?}}[[END:.+]] +// +// CHECK: [[IF_ELSE]] +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: call void [[OMP_OUTLINED1]](i32* {{%.+}}, i32* {{%.+}}) +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// +// CHECK: define internal void [[HVT4]]() +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* [[OMP_OUTLINED2:@.+]] to void (i32*, i32*, ...)*)) +// CHECK-NEXT: ret +// +// + + + + + +// CHECK: define internal void [[HVT5]]( +// CHECK-NOT: @__kmpc_fork_call +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: call void [[OMP_OUTLINED5:@.+]](i32* {{%.+}}, i32* {{%.+}}, i[[SZ]] {{.+}}) +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: ret +// +// + + +// CHECK: define internal void [[HVT6]]( +// CHECK-NOT: call void @__kmpc_serialized_parallel( +// CHECK-NOT: call void [[OMP_OUTLINED5:@.+]](i32* {{%.+}}, i32* {{%.+}}, i[[SZ]] {{.+}}) +// CHECK-NOT: call void @__kmpc_end_serialized_parallel( +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]])* [[OMP_OUTLINED5:@.+]] to void (i32*, i32*, ...)*), +// CHECK: ret +// +// + + + +#endif diff --git a/test/OpenMP/target_parallel_num_threads_codegen.cpp b/test/OpenMP/target_parallel_num_threads_codegen.cpp new file mode 100644 index 000000000000..de6a13d087ab --- /dev/null +++ b/test/OpenMP/target_parallel_num_threads_codegen.cpp @@ -0,0 +1,344 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } + +// CHECK-DAG: [[S1:%.+]] = type { double } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } + +// We have 6 target regions + +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] + +// Check if offloading descriptor is created. +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// Check target registration is registered as a Ctor. +// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + + #pragma omp target parallel num_threads(tx(20)) + { + } + + short b = 1; + #pragma omp target parallel num_threads(b) + { + a += b; + } + + return a; +} + +static +int fstatic(int n) { + + #pragma omp target parallel num_threads(n) + { + } + + #pragma omp target parallel num_threads(32+n) + { + } + + return n+1; +} + +struct S1 { + double a; + + int r1(int n){ + int b = 1; + + #pragma omp target parallel num_threads(n-b) + { + this->a = (double)b + 1.5; + } + + #pragma omp target parallel num_threads(1024) + { + this->a = 2.5; + } + + return (int)a; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + + + +// +// CHECK: define {{.*}}[[FS1]]([[S1]]* {{%.+}}, i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: store i32 1, i32* [[B:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[BV:%.+]] = load i32, i32* [[B]], align +// CHECK: [[SUB:%.+]] = sub nsw i32 [[NV]], [[BV]] +// CHECK: store i32 [[SUB]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[THREADS:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 1, i32 [[THREADS]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT1:@.+]]([[S1]]* {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.+}}, i32 1, i32 1024) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT2:@.+]]([[S1]]* {{[^,]+}}) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// +// CHECK: define {{.*}}[[FSTATIC]](i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: store i32 [[NV]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[THREADS:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 1, i32 [[THREADS]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT3:@.+]](i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[ADD:%.+]] = add nsw i32 32, [[NV]] +// CHECK: store i32 [[ADD]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[THREADS:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 1, i32 [[THREADS]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT4:@.+]](i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 0, {{.*}}, i32 1, i32 20) +// CHECK-NEXT: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK-NEXT: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]() +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// +// +// CHECK: store i16 1, i16* [[B:%.+]], align +// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align +// CHECK: store i16 [[BV]], i16* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i16, i16* [[CAPE_ADDR]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i16* +// CHECK: store i16 [[CEV]], i16* [[CONV]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[T:%.+]] = load i16, i16* [[CAPE_ADDR]], align +// CHECK: [[THREADS:%.+]] = sext i16 [[T]] to i32 +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 1, i32 [[THREADS]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]](i[[SZ]] {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// Check that the offloading functions are emitted and that the parallel function +// is appropriately guarded. + +// CHECK: define internal void [[HVT1]]([[S1]]* {{%.+}}, i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM2]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call void @__kmpc_push_num_threads(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 2, +// +// + + +// CHECK: define internal void [[HVT2]]([[S1]]* {{%.+}}) +// CHECK: call void @__kmpc_push_num_threads(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 1024) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 1, +// +// + + + + + + + + +// CHECK: define internal void [[HVT3]](i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call void @__kmpc_push_num_threads(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 0, +// +// +// CHECK: define internal void [[HVT4]](i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call void @__kmpc_push_num_threads(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 0, +// +// + + + + + +// CHECK: define internal void [[HVT5]]( +// CHECK: call void @__kmpc_push_num_threads(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 20) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 0, +// +// + + +// CHECK: define internal void [[HVT6]](i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]], i[[SZ]] [[PARM3:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM3]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i16* +// CHECK: [[T:%.+]] = load i16, i16* [[CONV]], align +// CHECK: [[NT:%.+]] = sext i16 [[T]] to i32 +// CHECK: call void @__kmpc_push_num_threads(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* [[DEF_LOC]], i32 2, +// +// + + + +#endif diff --git a/test/OpenMP/target_teams_codegen.cpp b/test/OpenMP/target_teams_codegen.cpp new file mode 100644 index 000000000000..6bac4e6e58c9 --- /dev/null +++ b/test/OpenMP/target_teams_codegen.cpp @@ -0,0 +1,802 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } + +// CHECK-DAG: [[TT:%.+]] = type { i64, i8 } +// CHECK-DAG: [[S1:%.+]] = type { double } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } + +// We have 8 target regions, but only 7 that actually will generate offloading +// code, only 6 will have mapped arguments, and only 4 have all-constant map +// sizes. + +// CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 2] +// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2] +// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 288, i32 288] +// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 288, i32 35, i32 288, i32 35, i32 35, i32 288, i32 288, i32 35, i32 35] +// CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 40] +// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 288, i32 288, i32 35] +// CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40] +// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 288, i32 288, i32 288, i32 35] +// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 35, i32 288, i32 288, i32 288, i32 35] +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK-NOT: @{{.+}} = constant [[ENTTY]] + +// Check if offloading descriptor is created. +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// Check target registration is registered as a Ctor. +// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + + +template<typename tx, typename ty> +struct TT{ + tx X; + ty Y; +}; + +// CHECK: define {{.*}}[[FOO:@.+]]( +int foo(int n) { + int a = 0; + short aa = 0; + float b[10]; + float bn[n]; + double c[5][10]; + double cn[5][n]; + TT<long long, char> d; + + // CHECK: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i32* null, i32 0, i32 0) + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target teams + { + } + + // CHECK: store i32 0, i32* [[RHV:%.+]], align 4 + // CHECK: store i32 -1, i32* [[RHV]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK: call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}}) + #pragma omp target teams if(target: 0) + { + a += 1; + } + + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET2]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT2]], i32 0, i32 0), i32 0, i32 0) + // CHECK-DAG: [[BP]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BPR:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[P]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BPR]], i32 0, i32 [[IDX0:[0-9]+]] + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR]], i32 0, i32 [[IDX0]] + // CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] + // CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] + // CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK-DAG: [[P0]] = inttoptr i[[SZ]] %{{.+}} to i8* + + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT2:@.+]](i[[SZ]] {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target teams if(target: 1) + { + aa += 1; + } + + // CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 10 + // CHECK: br i1 [[IF]], label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]] + // CHECK: [[IFTHEN]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i32* getelementptr inbounds ([2 x i32], [2 x i32]* [[MAPT3]], i32 0, i32 0), i32 0, i32 0) + // CHECK-DAG: [[BPR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[PR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%[^,]+]], i32 0, i32 0 + + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0 + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0 + // CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] + // CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] + // CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK-DAG: [[P0]] = inttoptr i[[SZ]] %{{.+}} to i8* + + // CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1 + // CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1 + // CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] + // CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] + // CHECK-DAG: [[BP1]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK-DAG: [[P1]] = inttoptr i[[SZ]] %{{.+}} to i8* + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK-NEXT: br label %[[IFEND:.+]] + + // CHECK: [[IFELSE]] + // CHECK: store i32 -1, i32* [[RHV]], align 4 + // CHECK-NEXT: br label %[[IFEND:.+]] + + // CHECK: [[IFEND]] + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT3:@.+]]({{[^,]+}}, {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target teams if(target: n>10) + { + a += 1; + aa += 1; + } + + // We capture 3 VLA sizes in this target region + // CHECK-64: [[A_VAL:%.+]] = load i32, i32* %{{.+}}, + // CHECK-64: [[A_ADDR:%.+]] = bitcast i[[SZ]]* [[A_CADDR:%.+]] to i32* + // CHECK-64: store i32 [[A_VAL]], i32* [[A_ADDR]], + // CHECK-64: [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]], + + // CHECK-32: [[A_VAL:%.+]] = load i32, i32* %{{.+}}, + // CHECK-32: store i32 [[A_VAL]], i32* [[A_CADDR:%.+]], + // CHECK-32: [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]], + + // CHECK: [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4 + // CHECK: [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]] + // CHECK: [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8 + + // CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20 + // CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] + // CHECK: [[TRY]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i32* getelementptr inbounds ([9 x i32], [9 x i32]* [[MAPT4]], i32 0, i32 0), i32 0, i32 0) + // CHECK-DAG: [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[SR]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S:%[^,]+]], i32 0, i32 0 + + // CHECK-DAG: [[SADDR0:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX0:[0-9]+]] + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX0]] + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX0]] + // CHECK-DAG: [[SADDR1:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX1:[0-9]+]] + // CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX1]] + // CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX1]] + // CHECK-DAG: [[SADDR2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX2:[0-9]+]] + // CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX2]] + // CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX2]] + // CHECK-DAG: [[SADDR3:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX3:[0-9]+]] + // CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX3]] + // CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX3]] + // CHECK-DAG: [[SADDR4:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX4:[0-9]+]] + // CHECK-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX4]] + // CHECK-DAG: [[PADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX4]] + // CHECK-DAG: [[SADDR5:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX5:[0-9]+]] + // CHECK-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX5]] + // CHECK-DAG: [[PADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX5]] + // CHECK-DAG: [[SADDR6:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX6:[0-9]+]] + // CHECK-DAG: [[BPADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX6]] + // CHECK-DAG: [[PADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX6]] + // CHECK-DAG: [[SADDR7:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX7:[0-9]+]] + // CHECK-DAG: [[BPADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX7]] + // CHECK-DAG: [[PADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX7]] + // CHECK-DAG: [[SADDR8:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX8:[0-9]+]] + // CHECK-DAG: [[BPADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX8]] + // CHECK-DAG: [[PADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX8]] + + // The names below are not necessarily consistent with the names used for the + // addresses above as some are repeated. + // CHECK-DAG: [[BP0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* + // CHECK-DAG: [[P0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* + // CHECK-DAG: store i8* [[BP0]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P0]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP1:%[^,]+]] = inttoptr i[[SZ]] [[VLA1]] to i8* + // CHECK-DAG: [[P1:%[^,]+]] = inttoptr i[[SZ]] [[VLA1]] to i8* + // CHECK-DAG: store i8* [[BP1]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P1]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: store i8* inttoptr (i[[SZ]] 5 to i8*), i8** {{%[^,]+}} + // CHECK-DAG: store i8* inttoptr (i[[SZ]] 5 to i8*), i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP3:%[^,]+]] = inttoptr i[[SZ]] [[A_CVAL]] to i8* + // CHECK-DAG: [[P3:%[^,]+]] = inttoptr i[[SZ]] [[A_CVAL]] to i8* + // CHECK-DAG: store i8* [[BP3]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P3]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8* + // CHECK-DAG: [[P4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP4]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P4]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 40, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP5:%[^,]+]] = bitcast float* %{{.+}} to i8* + // CHECK-DAG: [[P5:%[^,]+]] = bitcast float* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP5]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P5]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] [[BNSIZE]], i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8* + // CHECK-DAG: [[P6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP6]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P6]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 400, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP7:%[^,]+]] = bitcast double* %{{.+}} to i8* + // CHECK-DAG: [[P7:%[^,]+]] = bitcast double* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP7]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P7]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] [[CNSIZE]], i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8* + // CHECK-DAG: [[P8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP8]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P8]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{12|16}}, i[[SZ]]* {{%[^,]+}} + + // CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 + // CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + + // CHECK: [[FAIL]] + // CHECK: call void [[HVT4:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target teams if(target: n>20) + { + a += 1; + b[2] += 1.0; + bn[3] += 1.0; + c[1][2] += 1.0; + cn[1][3] += 1.0; + d.X += 1; + d.Y += 1; + } + + return a; +} + +// Check that the offloading functions are emitted and that the arguments are +// correct and loaded correctly for the target regions in foo(). + +// CHECK: define internal void [[HVT0]]() +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*)) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid.) +// CHECK: ret void +// CHECK-NEXT: } + + +// CHECK: define internal void [[HVT1]](i[[SZ]] %{{.+}}) +// Create stack storage and store argument in there. +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32* +// CHECK-64: [[AA:%.+]] = load i32, i32* [[AA_CADDR]], align +// CHECK-32: [[AA:%.+]] = load i32, i32* [[AA_ADDR]], align +// CHECK-64: [[AA_C:%.+]] = bitcast i[[SZ]]* [[AA_CASTED]] to i32* +// CHECK-64: store i32 [[AA]], i32* [[AA_C]], align +// CHECK-32: store i32 [[AA]], i32* [[AA_CASTED]], align +// CHECK: [[PARAM:%.+]] = load i[[SZ]], i[[SZ]]* [[AA_CASTED]], align +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]])* [[OMP_OUTLINED1:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[PARAM]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}) +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32* +// CHECK-64: [[AA:%.+]] = load i32, i32* [[AA_CADDR]], align +// CHECK-32: [[AA:%.+]] = load i32, i32* [[AA_ADDR]], align +// CHECK: ret void +// CHECK-NEXT: } + +// CHECK: define internal void [[HVT2]](i[[SZ]] %{{.+}}) +// Create stack storage and store argument in there. +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK: [[AA:%.+]] = load i16, i16* [[AA_CADDR]], align +// CHECK: [[AA_C:%.+]] = bitcast i[[SZ]]* [[AA_CASTED]] to i16* +// CHECK: store i16 [[AA]], i16* [[AA_C]], align +// CHECK: [[PARAM:%.+]] = load i[[SZ]], i[[SZ]]* [[AA_CASTED]], align +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]])* [[OMP_OUTLINED2:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[PARAM]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}) +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK: [[AA:%.+]] = load i16, i16* [[AA_CADDR]], align +// CHECK: ret void +// CHECK-NEXT: } + +// CHECK: define internal void [[HVT3]] +// Create stack storage and store argument in there. +// CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[A_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_CASTED:%.+]] = alloca i[[SZ]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[A_ADDR]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64-DAG:[[A_CADDR:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i32* +// CHECK-DAG: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK-64-DAG:[[A:%.+]] = load i32, i32* [[A_CADDR]], align +// CHECK-32-DAG:[[A:%.+]] = load i32, i32* [[A_ADDR]], align +// CHECK-64-DAG:[[A_C:%.+]] = bitcast i[[SZ]]* [[A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[A]], i32* [[A_C]], align +// CHECK-32-DAG:store i32 [[A]], i32* [[A_CASTED]], align +// CHECK-DAG: [[AA:%.+]] = load i16, i16* [[AA_CADDR]], align +// CHECK-DAG: [[AA_C:%.+]] = bitcast i[[SZ]]* [[AA_CASTED]] to i16* +// CHECK-DAG: store i16 [[AA]], i16* [[AA_C]], align +// CHECK-DAG: [[PARAM1:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CASTED]], align +// CHECK-DAG: [[PARAM2:%.+]] = load i[[SZ]], i[[SZ]]* [[AA_CASTED]], align +// CHECK-DAG: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]])* [[OMP_OUTLINED3:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[PARAM1]], i[[SZ]] [[PARAM2]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED3]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}) +// CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[A_ADDR]], align +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align +// CHECK-64-DAG:[[A_CADDR:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i32* +// CHECK-DAG: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* +// CHECK: ret void +// CHECK-NEXT: } + +// CHECK: define internal void [[HVT4]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* +// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_BN:%.+]] = alloca float* +// CHECK: [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]* +// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_VLA3:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_CN:%.+]] = alloca double* +// CHECK: [[LOCAL_D:%.+]] = alloca [[TT]]* +// CHECK: [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// CHECK-DAG: store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]] +// CHECK-DAG: store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]] +// CHECK-DAG: store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA3:%.+]], i[[SZ]]* [[LOCAL_VLA3]] +// CHECK-DAG: store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]] +// CHECK-DAG: store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]] + +// CHECK-64-DAG:[[CONV_AP:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// CHECK-DAG: [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]], +// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], +// CHECK-DAG: [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]], +// CHECK-DAG: [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]], +// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], +// CHECK-DAG: [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA3]], +// CHECK-DAG: [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]], +// CHECK-DAG: [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]], + +// CHECK-64-DAG:[[CONV_A:%.+]] = load i32, i32* [[CONV_AP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_A]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_AV:%.+]] = load i32, i32* [[LOCAL_A]] +// CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align +// CHECK-DAG: [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 9, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], [10 x float]*, i[[SZ]], float*, [5 x [10 x double]]*, i[[SZ]], i[[SZ]], double*, [[TT]]*)* [[OMP_OUTLINED4:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], [10 x float]* [[REF_B]], i[[SZ]] [[VAL_VLA1]], float* [[REF_BN]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] [[VAL_VLA2]], i[[SZ]] [[VAL_VLA3]], double* [[REF_CN]], [[TT]]* [[REF_D]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED4]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, [10 x float]* {{.+}}, i[[SZ]] %{{.+}}, float* %{{.+}}, [5 x [10 x double]]* {{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, double* %{{.+}}, [[TT]]* {{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target teams if(target: n>40) + { + a += 1; + aa += 1; + b[2] += 1; + } + + return a; +} + +static +int fstatic(int n) { + int a = 0; + short aa = 0; + char aaa = 0; + int b[10]; + + #pragma omp target teams if(target: n>50) + { + a += 1; + aa += 1; + aaa += 1; + b[2] += 1; + } + + return a; +} + +struct S1 { + double a; + + int r1(int n){ + int b = n+1; + short int c[2][n]; + + #pragma omp target teams if(target: n>60) + { + this->a = (double)b + 1.5; + c[1][1] = ++a; + } + + return c[1][1] + (int)b; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + // CHECK: call {{.*}}i32 [[FOO]](i32 {{.*}}) + a += foo(n); + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + +// +// CHECK: define {{.*}}[[FS1]] +// +// CHECK: i8* @llvm.stacksave() +// CHECK-64: [[B_ADDR:%.+]] = bitcast i[[SZ]]* [[B_CADDR:%.+]] to i32* +// CHECK-64: store i32 %{{.+}}, i32* [[B_ADDR]], +// CHECK-64: [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_CADDR]], + +// CHECK-32: store i32 %{{.+}}, i32* [[B_ADDR:%.+]], +// CHECK-32: [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]], + +// We capture 2 VLA sizes in this target region +// CHECK: [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]] +// CHECK: [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2 + +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60 +// CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] +// CHECK: [[TRY]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i32* getelementptr inbounds ([5 x i32], [5 x i32]* [[MAPT7]], i32 0, i32 0), i32 0, i32 0) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0 +// CHECK-DAG: [[SR]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S:%.+]], i32 0, i32 0 +// CHECK-DAG: [[SADDR0:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX0:[0-9]+]] +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]] +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]] +// CHECK-DAG: [[SADDR1:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX1:[0-9]+]] +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]] +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]] +// CHECK-DAG: [[SADDR2:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX2:[0-9]+]] +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]] +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]] +// CHECK-DAG: [[SADDR3:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX3:[0-9]+]] +// CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]] +// CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]] + +// The names below are not necessarily consistent with the names used for the +// addresses above as some are repeated. +// CHECK-DAG: [[BP0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* +// CHECK-DAG: [[P0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8* +// CHECK-DAG: store i8* [[BP0]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P0]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: store i8* inttoptr (i[[SZ]] 2 to i8*), i8** {{%[^,]+}} +// CHECK-DAG: store i8* inttoptr (i[[SZ]] 2 to i8*), i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP2:%[^,]+]] = inttoptr i[[SZ]] [[B_CVAL]] to i8* +// CHECK-DAG: [[P2:%[^,]+]] = inttoptr i[[SZ]] [[B_CVAL]] to i8* +// CHECK-DAG: store i8* [[BP2]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P2]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8* +// CHECK-DAG: [[P3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP3]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P3]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] 8, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP4:%[^,]+]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: [[P4:%[^,]+]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP4]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P4]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] [[CSIZE]], i[[SZ]]* {{%[^,]+}} + +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + +// CHECK: [[FAIL]] +// CHECK: call void [[HVT7:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// +// CHECK: define {{.*}}[[FSTATIC]] +// +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 50 +// CHECK: br i1 [[IF]], label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]] +// CHECK: [[IFTHEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 4, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([4 x i[[SZ]]], [4 x i[[SZ]]]* [[SIZET6]], i32 0, i32 0), i32* getelementptr inbounds ([4 x i32], [4 x i32]* [[MAPT6]], i32 0, i32 0), i32 0, i32 0) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P:%.+]], i32 0, i32 0 + +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 0 +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 0 +// CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] +// CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] +// CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] [[VAL0:%.+]] to i8* +// CHECK-DAG: [[P0]] = inttoptr i[[SZ]] [[VAL0]] to i8* + +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 1 +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 1 +// CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] +// CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] +// CHECK-DAG: [[BP1]] = inttoptr i[[SZ]] [[VAL1:%.+]] to i8* +// CHECK-DAG: [[P1]] = inttoptr i[[SZ]] [[VAL1]] to i8* + +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 2 +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 2 +// CHECK-DAG: store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]] +// CHECK-DAG: store i8* [[P2:%[^,]+]], i8** [[PADDR2]] + +// CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 3 +// CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 3 +// CHECK-DAG: store i8* [[BP3:%[^,]+]], i8** [[BPADDR3]] +// CHECK-DAG: store i8* [[P3:%[^,]+]], i8** [[PADDR3]] +// CHECK-DAG: [[BP3]] = bitcast [10 x i32]* %{{.+}} to i8* +// CHECK-DAG: [[P3]] = bitcast [10 x i32]* %{{.+}} to i8* + +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFEND]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 40 +// CHECK: br i1 [[IF]], label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]] +// CHECK: [[IFTHEN]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([3 x i[[SZ]]], [3 x i[[SZ]]]* [[SIZET5]], i32 0, i32 0), i32* getelementptr inbounds ([3 x i32], [3 x i32]* [[MAPT5]], i32 0, i32 0), i32 0, i32 0) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P:%.+]], i32 0, i32 0 + +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 0 +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 0 +// CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] +// CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] +// CHECK-DAG: [[BP0]] = inttoptr i[[SZ]] [[VAL0:%.+]] to i8* +// CHECK-DAG: [[P0]] = inttoptr i[[SZ]] [[VAL0]] to i8* + +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 1 +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 1 +// CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] +// CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] +// CHECK-DAG: [[BP1]] = inttoptr i[[SZ]] [[VAL1:%.+]] to i8* +// CHECK-DAG: [[P1]] = inttoptr i[[SZ]] [[VAL1]] to i8* + +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 2 +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 2 +// CHECK-DAG: store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]] +// CHECK-DAG: store i8* [[P2:%[^,]+]], i8** [[PADDR2]] +// CHECK-DAG: [[BP2]] = bitcast [10 x i32]* %{{.+}} to i8* +// CHECK-DAG: [[P2]] = bitcast [10 x i32]* %{{.+}} to i8* + +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFELSE]] +// CHECK: store i32 -1, i32* [[RHV]], align 4 +// CHECK-NEXT: br label %[[IFEND:.+]] + +// CHECK: [[IFEND]] +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align 4 +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + + + +// Check that the offloading functions are emitted and that the arguments are +// correct and loaded correctly for the target regions of the callees of bar(). + +// CHECK: define internal void [[HVT7]] +// Create local storage for each capture. +// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1]]* +// CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_C:%.+]] = alloca i16* +// CHECK: [[LOCAL_B_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]] +// CHECK-DAG: store i[[SZ]] [[ARG_B:%.+]], i[[SZ]]* [[LOCAL_B]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]] +// CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]] +// CHECK-DAG: store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]] +// Store captures in the context. +// CHECK-DAG: [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]], +// CHECK-64-DAG:[[CONV_BP:%.+]] = bitcast i[[SZ]]* [[LOCAL_B]] to i32* +// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], +// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], +// CHECK-DAG: [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]], + +// CHECK-64-DAG:[[CONV_B:%.+]] = load i32, i32* [[CONV_BP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_B_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_B]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_BV:%.+]] = load i32, i32* [[LOCAL_B]] +// CHECK-32-DAG:store i32 [[LOCAL_BV]], i32* [[LOCAL_B_CASTED]], align +// CHECK-DAG: [[REF_B:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_B_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[S1]]*, i[[SZ]], i[[SZ]], i[[SZ]], i16*)* [[OMP_OUTLINED5:@.+]] to void (i32*, i32*, ...)*), [[S1]]* [[REF_THIS]], i[[SZ]] [[REF_B]], i[[SZ]] [[VAL_VLA1]], i[[SZ]] [[VAL_VLA2]], i16* [[REF_C]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* %{{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + + +// CHECK: define internal void [[HVT6]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AAA:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* +// CHECK: [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA_CASTED:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AAA_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] +// CHECK-DAG: store i[[SZ]] [[ARG_AAA:%.+]], i[[SZ]]* [[LOCAL_AAA]] +// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] +// Store captures in the context. +// CHECK-64-DAG:[[CONV_AP:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// CHECK-DAG: [[CONV_AAP:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* +// CHECK-DAG: [[CONV_AAAP:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8* +// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], + +// CHECK-64-DAG:[[CONV_A:%.+]] = load i32, i32* [[CONV_AP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_A]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_AV:%.+]] = load i32, i32* [[LOCAL_A]] +// CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align +// CHECK-DAG: [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]], + +// CHECK-DAG: [[CONV_AA:%.+]] = load i16, i16* [[CONV_AAP]] +// CHECK-DAG: [[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA_CASTED]] to i16* +// CHECK-DAG: store i16 [[CONV_AA]], i16* [[CONV]], align +// CHECK-DAG: [[REF_AA:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_AA_CASTED]], + +// CHECK-DAG: [[CONV_AAA:%.+]] = load i8, i8* [[CONV_AAAP]] +// CHECK-DAG: [[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA_CASTED]] to i8* +// CHECK-DAG: store i8 [[CONV_AAA]], i8* [[CONV]], align +// CHECK-DAG: [[REF_AAA:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_AAA_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]], i[[SZ]], [10 x i32]*)* [[OMP_OUTLINED6:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], i[[SZ]] [[REF_AA]], i[[SZ]] [[REF_AAA]], [10 x i32]* [[REF_B]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED6]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x i32]* {{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + +// CHECK: define internal void [[HVT5]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]* +// CHECK: [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]] +// CHECK: [[LOCAL_AA_CASTED:%.+]] = alloca i[[SZ]] +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]] +// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] +// Store captures in the context. +// CHECK-64-DAG:[[CONV_AP:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// CHECK-DAG: [[CONV_AAP:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* +// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], + +// CHECK-64-DAG:[[CONV_A:%.+]] = load i32, i32* [[CONV_AP]] +// CHECK-64-DAG:[[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_A_CASTED]] to i32* +// CHECK-64-DAG:store i32 [[CONV_A]], i32* [[CONV]], align +// CHECK-32-DAG:[[LOCAL_AV:%.+]] = load i32, i32* [[LOCAL_A]] +// CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align +// CHECK-DAG: [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]], + +// CHECK-DAG: [[CONV_AA:%.+]] = load i16, i16* [[CONV_AAP]] +// CHECK-DAG: [[CONV:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA_CASTED]] to i16* +// CHECK-DAG: store i16 [[CONV_AA]], i16* [[CONV]], align +// CHECK-DAG: [[REF_AA:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_AA_CASTED]], + +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], i[[SZ]], [10 x i32]*)* [[OMP_OUTLINED7:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], i[[SZ]] [[REF_AA]], [10 x i32]* [[REF_B]]) +// +// +// CHECK: define internal {{.*}}void [[OMP_OUTLINED7]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x i32]* {{.+}}) +// To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. + +#endif diff --git a/test/OpenMP/target_teams_codegen_registration.cpp b/test/OpenMP/target_teams_codegen_registration.cpp new file mode 100644 index 000000000000..f65c701d7a66 --- /dev/null +++ b/test/OpenMP/target_teams_codegen_registration.cpp @@ -0,0 +1,437 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// Test target teams codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// Check that no target code is emmitted if no omptests flag was provided. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: [[SA:%.+]] = type { [4 x i32] } +// CHECK-DAG: [[SB:%.+]] = type { [8 x i32] } +// CHECK-DAG: [[SC:%.+]] = type { [16 x i32] } +// CHECK-DAG: [[SD:%.+]] = type { [32 x i32] } +// CHECK-DAG: [[SE:%.+]] = type { [64 x i32] } +// CHECK-DAG: [[ST1:%.+]] = type { [228 x i32] } +// CHECK-DAG: [[ST2:%.+]] = type { [1128 x i32] } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } + +// CHECK-DAG: [[A1:@.+]] = internal global [[SA]] +// CHECK-DAG: [[A2:@.+]] = global [[SA]] +// CHECK-DAG: [[B1:@.+]] = global [[SB]] +// CHECK-DAG: [[B2:@.+]] = global [[SB]] +// CHECK-DAG: [[C1:@.+]] = internal global [[SC]] +// CHECK-DAG: [[D1:@.+]] = global [[SD]] +// CHECK-DAG: [[E1:@.+]] = global [[SE]] +// CHECK-DAG: [[T1:@.+]] = global [[ST1]] +// CHECK-DAG: [[T2:@.+]] = global [[ST2]] + +// CHECK-NTARGET-DAG: [[SA:%.+]] = type { [4 x i32] } +// CHECK-NTARGET-DAG: [[SB:%.+]] = type { [8 x i32] } +// CHECK-NTARGET-DAG: [[SC:%.+]] = type { [16 x i32] } +// CHECK-NTARGET-DAG: [[SD:%.+]] = type { [32 x i32] } +// CHECK-NTARGET-DAG: [[SE:%.+]] = type { [64 x i32] } +// CHECK-NTARGET-DAG: [[ST1:%.+]] = type { [228 x i32] } +// CHECK-NTARGET-DAG: [[ST2:%.+]] = type { [1128 x i32] } +// CHECK-NTARGET-NOT: type { i8*, i8*, % +// CHECK-NTARGET-NOT: type { i32, % + +// We have 7 target regions + +// CHECK-DAG: {{@.+}} = private constant i8 0 +// TCHECK-NOT: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] +// CHECK-DAG: {{@.+}} = private constant i8 0 +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4] +// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288] + +// CHECK-NTARGET-NOT: private constant i8 0 +// CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i + +// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00" +// CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00" +// CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00" +// CHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00" +// CHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00" +// CHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00" +// CHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00" +// CHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00" +// CHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00" +// CHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00" +// CHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00" +// CHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00" +// CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 + +// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00" +// TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00" +// TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00" +// TCHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00" +// TCHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00" +// TCHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00" +// TCHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00" +// TCHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00" +// TCHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00" +// TCHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00" +// TCHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00" +// TCHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00" +// TCHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 + +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. +// CHECK: @llvm.global_ctors = appending global [4 x { i32, void ()*, i8* }] [ +// CHECK-SAME: { i32, void ()*, i8* } { i32 500, void ()* [[P500:@[^,]+]], i8* null }, +// CHECK-SAME: { i32, void ()*, i8* } { i32 501, void ()* [[P501:@[^,]+]], i8* null }, +// CHECK-SAME: { i32, void ()*, i8* } { i32 65535, void ()* [[PMAX:@[^,]+]], i8* null }, +// CHECK-SAME: { i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + +// CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, void ()*, i8* }] [ + +extern int *R; + +struct SA { + int arr[4]; + void foo() { + int a = *R; + a += 1; + *R = a; + } + SA() { + int a = *R; + a += 2; + *R = a; + } + ~SA() { + int a = *R; + a += 3; + *R = a; + } +}; + +struct SB { + int arr[8]; + void foo() { + int a = *R; + #pragma omp target teams + a += 4; + *R = a; + } + SB() { + int a = *R; + a += 5; + *R = a; + } + ~SB() { + int a = *R; + a += 6; + *R = a; + } +}; + +struct SC { + int arr[16]; + void foo() { + int a = *R; + a += 7; + *R = a; + } + SC() { + int a = *R; + #pragma omp target teams + a += 8; + *R = a; + } + ~SC() { + int a = *R; + a += 9; + *R = a; + } +}; + +struct SD { + int arr[32]; + void foo() { + int a = *R; + a += 10; + *R = a; + } + SD() { + int a = *R; + a += 11; + *R = a; + } + ~SD() { + int a = *R; + #pragma omp target teams + a += 12; + *R = a; + } +}; + +struct SE { + int arr[64]; + void foo() { + int a = *R; + #pragma omp target teams if(target: 0) + a += 13; + *R = a; + } + SE() { + int a = *R; + #pragma omp target teams + a += 14; + *R = a; + } + ~SE() { + int a = *R; + #pragma omp target teams + a += 15; + *R = a; + } +}; + +template <int x> +struct ST { + int arr[128 + x]; + void foo() { + int a = *R; + #pragma omp target teams + a += 16 + x; + *R = a; + } + ST() { + int a = *R; + #pragma omp target teams + a += 17 + x; + *R = a; + } + ~ST() { + int a = *R; + #pragma omp target teams + a += 18 + x; + *R = a; + } +}; + +// We have to make sure we us all the target regions: +//CHECK-DAG: define internal void @[[NAME1]]( +//CHECK-DAG: call void @[[NAME1]]( +//CHECK-DAG: define internal void @[[NAME2]]( +//CHECK-DAG: call void @[[NAME2]]( +//CHECK-DAG: define internal void @[[NAME3]]( +//CHECK-DAG: call void @[[NAME3]]( +//CHECK-DAG: define internal void @[[NAME4]]( +//CHECK-DAG: call void @[[NAME4]]( +//CHECK-DAG: define internal void @[[NAME5]]( +//CHECK-DAG: call void @[[NAME5]]( +//CHECK-DAG: define internal void @[[NAME6]]( +//CHECK-DAG: call void @[[NAME6]]( +//CHECK-DAG: define internal void @[[NAME7]]( +//CHECK-DAG: call void @[[NAME7]]( +//CHECK-DAG: define internal void @[[NAME8]]( +//CHECK-DAG: call void @[[NAME8]]( +//CHECK-DAG: define internal void @[[NAME9]]( +//CHECK-DAG: call void @[[NAME9]]( +//CHECK-DAG: define internal void @[[NAME10]]( +//CHECK-DAG: call void @[[NAME10]]( +//CHECK-DAG: define internal void @[[NAME11]]( +//CHECK-DAG: call void @[[NAME11]]( +//CHECK-DAG: define internal void @[[NAME12]]( +//CHECK-DAG: call void @[[NAME12]]( + +//TCHECK-DAG: define void @[[NAME1]]( +//TCHECK-DAG: define void @[[NAME2]]( +//TCHECK-DAG: define void @[[NAME3]]( +//TCHECK-DAG: define void @[[NAME4]]( +//TCHECK-DAG: define void @[[NAME5]]( +//TCHECK-DAG: define void @[[NAME6]]( +//TCHECK-DAG: define void @[[NAME7]]( +//TCHECK-DAG: define void @[[NAME8]]( +//TCHECK-DAG: define void @[[NAME9]]( +//TCHECK-DAG: define void @[[NAME10]]( +//TCHECK-DAG: define void @[[NAME11]]( +//TCHECK-DAG: define void @[[NAME12]]( + +// CHECK-NTARGET-NOT: __tgt_target +// CHECK-NTARGET-NOT: __tgt_register_lib +// CHECK-NTARGET-NOT: __tgt_unregister_lib + +// TCHECK-NOT: __tgt_target +// TCHECK-NOT: __tgt_register_lib +// TCHECK-NOT: __tgt_unregister_lib + +// We have 2 initializers with priority 500 +//CHECK: define internal void [[P500]]( +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK-NOT: call void @{{.+}}() +//CHECK: ret void + +// We have 1 initializers with priority 501 +//CHECK: define internal void [[P501]]( +//CHECK: call void @{{.+}}() +//CHECK-NOT: call void @{{.+}}() +//CHECK: ret void + +// We have 6 initializers with default priority +//CHECK: define internal void [[PMAX]]( +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK: call void @{{.+}}() +//CHECK-NOT: call void @{{.+}}() +//CHECK: ret void + +// Check registration and unregistration + +//CHECK: define internal void [[UNREGFN:@.+]](i8*) +//CHECK: call i32 @__tgt_unregister_lib([[DSCTY]]* [[DESC]]) +//CHECK: ret void +//CHECK: declare i32 @__tgt_unregister_lib([[DSCTY]]*) + +//CHECK: define internal void [[REGFN]](i8*) +//CHECK: call i32 @__tgt_register_lib([[DSCTY]]* [[DESC]]) +//CHECK: call i32 @__cxa_atexit(void (i8*)* [[UNREGFN]], i8* bitcast ([[DSCTY]]* [[DESC]] to i8*), +//CHECK: ret void +//CHECK: declare i32 @__tgt_register_lib([[DSCTY]]*) + +static __attribute__((init_priority(500))) SA a1; +SA a2; +SB __attribute__((init_priority(500))) b1; +SB __attribute__((init_priority(501))) b2; +static SC c1; +SD d1; +SE e1; +ST<100> t1; +ST<1000> t2; + + +int bar(int a){ + int r = a; + + a1.foo(); + a2.foo(); + b1.foo(); + b2.foo(); + c1.foo(); + d1.foo(); + e1.foo(); + t1.foo(); + t2.foo(); + + #pragma omp target teams + ++r; + + return r + *R; +} + +// Check metadata is properly generated: +// CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}} + +// TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}} + +#endif diff --git a/test/OpenMP/target_teams_codegen_registration_naming.cpp b/test/OpenMP/target_teams_codegen_registration_naming.cpp new file mode 100644 index 000000000000..fb4232897282 --- /dev/null +++ b/test/OpenMP/target_teams_codegen_registration_naming.cpp @@ -0,0 +1,66 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// Test target teams codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK: [[CA:%.+]] = type { i32* } + +// CHECK: define {{.*}}i32 @[[NNAME:.+]](i32 {{.*}}%{{.+}}) +int nested(int a){ + // CHECK: call void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME]]_l[[T1L:[0-9]+]]( + #pragma omp target teams + ++a; + + // CHECK: call void @"[[LNAME:.+]]"([[CA]]* + auto F = [&](){ + #pragma omp parallel + { + #pragma omp target teams + ++a; + } + }; + + F(); + + return a; +} + +// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T1L]]( +// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME:.+]]_l[[T1L:[0-9]+]]( + +// CHECK: define {{.*}}void @"[[LNAME]]"( +// CHECK: call void {{.*}}@__kmpc_fork_call{{.+}}[[PNAME:@.+]] to + +// CHECK: define {{.*}}void [[PNAME]]( +// CHECK: call void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L:[0-9]+]]( + +// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L]]( +// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME:.+]]_l[[T2L:[0-9]+]]( + + +// Check metadata is properly generated: +// CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}} + +// TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}} +#endif diff --git a/test/OpenMP/target_teams_num_teams_codegen.cpp b/test/OpenMP/target_teams_num_teams_codegen.cpp new file mode 100644 index 000000000000..6fcfa38e9bd3 --- /dev/null +++ b/test/OpenMP/target_teams_num_teams_codegen.cpp @@ -0,0 +1,344 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } + +// CHECK-DAG: [[S1:%.+]] = type { double } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } + +// We have 6 target regions + +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] + +// Check if offloading descriptor is created. +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// Check target registration is registered as a Ctor. +// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + + #pragma omp target teams num_teams(tx(20)) + { + } + + short b = 1; + #pragma omp target teams num_teams(b) + { + a += b; + } + + return a; +} + +static +int fstatic(int n) { + + #pragma omp target teams num_teams(n) + { + } + + #pragma omp target teams num_teams(32+n) + { + } + + return n+1; +} + +struct S1 { + double a; + + int r1(int n){ + int b = 1; + + #pragma omp target teams num_teams(n-b) + { + this->a = (double)b + 1.5; + } + + #pragma omp target teams num_teams(1024) + { + this->a = 2.5; + } + + return (int)a; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + + + +// +// CHECK: define {{.*}}[[FS1]]([[S1]]* {{%.+}}, i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: store i32 1, i32* [[B:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[BV:%.+]] = load i32, i32* [[B]], align +// CHECK: [[SUB:%.+]] = sub nsw i32 [[NV]], [[BV]] +// CHECK: store i32 [[SUB]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[TEAMS:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 [[TEAMS]], i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT1:@.+]]([[S1]]* {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.+}}, i32 1024, i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT2:@.+]]([[S1]]* {{[^,]+}}) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// +// CHECK: define {{.*}}[[FSTATIC]](i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: store i32 [[NV]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[TEAMS:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 [[TEAMS]], i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT3:@.+]](i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[ADD:%.+]] = add nsw i32 32, [[NV]] +// CHECK: store i32 [[ADD]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[TEAMS:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 [[TEAMS]], i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT4:@.+]](i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 0, {{.*}}, i32 20, i32 0) +// CHECK-NEXT: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK-NEXT: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]() +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// +// +// CHECK: store i16 1, i16* [[B:%.+]], align +// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align +// CHECK: store i16 [[BV]], i16* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i16, i16* [[CAPE_ADDR]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i16* +// CHECK: store i16 [[CEV]], i16* [[CONV]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[T:%.+]] = load i16, i16* [[CAPE_ADDR]], align +// CHECK: [[TEAMS:%.+]] = sext i16 [[T]] to i32 +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 [[TEAMS]], i32 0) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]](i[[SZ]] {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// Check that the offloading functions are emitted and that the parallel function +// is appropriately guarded. + +// CHECK: define internal void [[HVT1]]([[S1]]* {{%.+}}, i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM2]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]], i32 0) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 2, +// +// + + +// CHECK: define internal void [[HVT2]]([[S1]]* {{%.+}}) +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 1024, i32 0) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 1, +// +// + + + + + + + + +// CHECK: define internal void [[HVT3]](i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]], i32 0) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, +// +// +// CHECK: define internal void [[HVT4]](i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]], i32 0) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, +// +// + + + + + +// CHECK: define internal void [[HVT5]]( +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 20, i32 0) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, +// +// + + +// CHECK: define internal void [[HVT6]](i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]], i[[SZ]] [[PARM3:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM3]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i16* +// CHECK: [[T:%.+]] = load i16, i16* [[CONV]], align +// CHECK: [[NT:%.+]] = sext i16 [[T]] to i32 +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]], i32 0) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 2, +// +// + + + +#endif diff --git a/test/OpenMP/target_teams_thread_limit_codegen.cpp b/test/OpenMP/target_teams_thread_limit_codegen.cpp new file mode 100644 index 000000000000..b2fe1bf60e6f --- /dev/null +++ b/test/OpenMP/target_teams_thread_limit_codegen.cpp @@ -0,0 +1,357 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } + +// CHECK-DAG: [[S1:%.+]] = type { double } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } +// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } +// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } + +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } + +// We have 6 target regions + +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] +// TCHECK: @{{.+}} = constant [[ENTTY]] + +// Check if offloading descriptor is created. +// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] +// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] +// CHECK: [[DEVBEGIN:@.+]] = external constant i8 +// CHECK: [[DEVEND:@.+]] = external constant i8 +// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }] +// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] } + +// Check target registration is registered as a Ctor. +// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* [[REGFN:@.+]] to void ()*), i8* null }] + + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + + #pragma omp target teams thread_limit(tx(20)) + { + } + + short b = 1; + #pragma omp target teams num_teams(b) thread_limit(1024) + { + a += b; + } + + return a; +} + +static +int fstatic(int n) { + + #pragma omp target teams num_teams(n) thread_limit(n*32) + { + } + + #pragma omp target teams thread_limit(32+n) + { + } + + return n+1; +} + +struct S1 { + double a; + + int r1(int n){ + int b = 1; + + #pragma omp target teams thread_limit(n-b) + { + this->a = (double)b + 1.5; + } + + #pragma omp target teams thread_limit(1024) + { + this->a = 2.5; + } + + return (int)a; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + + + +// +// CHECK: define {{.*}}[[FS1]]([[S1]]* {{%.+}}, i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: store i32 1, i32* [[B:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[BV:%.+]] = load i32, i32* [[B]], align +// CHECK: [[SUB:%.+]] = sub nsw i32 [[NV]], [[BV]] +// CHECK: store i32 [[SUB]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[TL:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 0, i32 [[TL]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT1:@.+]]([[S1]]* {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.+}}, i32 0, i32 1024) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT2:@.+]]([[S1]]* {{[^,]+}}) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// +// CHECK: define {{.*}}[[FSTATIC]](i32 {{[^%]*}}[[PARM:%.+]]) +// +// CHECK-DAG: store i32 [[PARM]], i32* [[N_ADDR:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: store i32 [[NV]], i32* [[CAPE_ADDR1:%.+]], align +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[MUL:%.+]] = mul nsw i32 [[NV]], 32 +// CHECK: store i32 [[MUL]], i32* [[CAPE_ADDR2:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR1]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR1:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR1:%.+]], align +// CHECK: [[ARG1:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR1]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR2]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR2:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR2:%.+]], align +// CHECK: [[ARG2:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR2]], align +// CHECK: [[TEAMS:%.+]] = load i32, i32* [[CAPE_ADDR1]], align +// CHECK: [[TL:%.+]] = load i32, i32* [[CAPE_ADDR2]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, {{.*}}, i32 [[TEAMS]], i32 [[TL]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT3:@.+]](i[[SZ]] [[ARG1]], i[[SZ]] [[ARG2]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// +// +// +// CHECK: [[NV:%.+]] = load i32, i32* [[N_ADDR]], align +// CHECK: [[ADD:%.+]] = add nsw i32 32, [[NV]] +// CHECK: store i32 [[ADD]], i32* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i32* +// CHECK-64: store i32 [[CEV]], i32* [[CONV]], align +// CHECK-32: store i32 [[CEV]], i32* [[CAPEC_ADDR:%.+]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[TL:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, {{.*}}, i32 0, i32 [[TL]]) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT4:@.+]](i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 0, {{.*}}, i32 0, i32 20) +// CHECK-NEXT: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK-NEXT: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]() +// CHECK: br label {{%?}}[[END]] +// +// CHECK: [[END]] +// +// +// +// CHECK: store i16 1, i16* [[B:%.+]], align +// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align +// CHECK: store i16 [[BV]], i16* [[CAPE_ADDR:%.+]], align +// CHECK: [[CEV:%.+]] = load i16, i16* [[CAPE_ADDR]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPEC_ADDR:%.+]] to i16* +// CHECK: store i16 [[CEV]], i16* [[CONV]], align +// CHECK: [[ARG:%.+]] = load i[[SZ]], i[[SZ]]* [[CAPEC_ADDR]], align +// CHECK: [[T:%.+]] = load i16, i16* [[CAPE_ADDR]], align +// CHECK: [[TEAMS:%.+]] = sext i16 [[T]] to i32 +// +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, {{.*}}, i32 [[TEAMS]], i32 1024) +// CHECK: store i32 [[RET]], i32* [[RHV:%.+]], align +// CHECK: [[RET2:%.+]] = load i32, i32* [[RHV]], align +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0 +// CHECK: br i1 [[ERROR]], label %[[FAIL:.+]], label %[[END:[^,]+]] +// +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]](i[[SZ]] {{%.+}}, i[[SZ]] {{%.+}}, i[[SZ]] [[ARG]]) +// CHECK: br label {{%?}}[[END]] +// CHECK: [[END]] +// + + + + + + +// Check that the offloading functions are emitted and that the parallel function +// is appropriately guarded. + +// CHECK: define internal void [[HVT1]]([[S1]]* {{%.+}}, i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM2]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[TL:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[TL:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 0, i32 [[TL]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 2, +// +// + + +// CHECK: define internal void [[HVT2]]([[S1]]* {{%.+}}) +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 0, i32 1024) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 1, +// +// + + + + + + + + +// CHECK: define internal void [[HVT3]](i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM1]], i[[SZ]]* [[CAPE_ADDR1:%.+]], align +// CHECK-DAG: store i[[SZ]] [[PARM2]], i[[SZ]]* [[CAPE_ADDR2:%.+]], align +// CHECK-64: [[CONV1:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR1]] to i32* +// CHECK-64: [[CONV2:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR2]] to i32* +// CHECK-64: [[NT:%.+]] = load i32, i32* [[CONV1]], align +// CHECK-64: [[TL:%.+]] = load i32, i32* [[CONV2]], align +// CHECK-32: [[NT:%.+]] = load i32, i32* [[CAPE_ADDR1]], align +// CHECK-32: [[TL:%.+]] = load i32, i32* [[CAPE_ADDR2]], align +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]], i32 [[TL]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, +// +// +// CHECK: define internal void [[HVT4]](i[[SZ]] [[PARM:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK-64: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i32* +// CHECK-64: [[TL:%.+]] = load i32, i32* [[CONV]], align +// CHECK-32: [[TL:%.+]] = load i32, i32* [[CAPE_ADDR]], align +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 0, i32 [[TL]]) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, +// +// + + + + + +// CHECK: define internal void [[HVT5]]( +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 0, i32 20) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 0, +// +// + + +// CHECK: define internal void [[HVT6]](i[[SZ]] [[PARM1:%.+]], i[[SZ]] [[PARM2:%.+]], i[[SZ]] [[PARM3:%.+]]) +// CHECK-DAG: store i[[SZ]] [[PARM3]], i[[SZ]]* [[CAPE_ADDR:%.+]], align +// CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[CAPE_ADDR]] to i16* +// CHECK: [[T:%.+]] = load i16, i16* [[CONV]], align +// CHECK: [[NT:%.+]] = sext i16 [[T]] to i32 +// CHECK: call i32 @__kmpc_push_num_teams(%ident_t* {{[^,]+}}, i32 {{[^,]+}}, i32 [[NT]], i32 1024) +// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 2, +// +// + + + +#endif diff --git a/test/OpenMP/threadprivate_codegen.cpp b/test/OpenMP/threadprivate_codegen.cpp index 318415761ac7..3785b70c19ba 100644 --- a/test/OpenMP/threadprivate_codegen.cpp +++ b/test/OpenMP/threadprivate_codegen.cpp @@ -176,7 +176,7 @@ struct S5 { // CHECK-TLS-DAG: [[ST_S4_ST:@.+]] = linkonce_odr thread_local global %struct.S4 zeroinitializer // CHECK-TLS-DAG: [[ST_S4_ST_GUARD:@_ZGVN2STI2S4E2stE]] = linkonce_odr thread_local global i64 0 // CHECK-TLS-DAG: @__tls_guard = internal thread_local global i8 0 -// CHECK-TLS-DAG: @__dso_handle = external global i8 +// CHECK-TLS-DAG: @__dso_handle = external hidden global i8 // CHECK-TLS-DAG: [[GS1_TLS_INIT:@_ZTHL3gs1]] = internal alias void (), void ()* @__tls_init // CHECK-TLS-DAG: [[ARR_X_TLS_INIT:@_ZTH5arr_x]] = alias void (), void ()* @__tls_init diff --git a/test/OpenMP/vla_crash.c b/test/OpenMP/vla_crash.c new file mode 100644 index 000000000000..50dcf068707b --- /dev/null +++ b/test/OpenMP/vla_crash.c @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -verify -triple powerpc64le-unknown-linux-gnu -fopenmp -x c -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics + +int a; + +// CHECK-LABEL: foo +void foo() { + int(*b)[a]; + int *(**c)[a]; + // CHECK: [[B:%.+]] = alloca i32*, + // CHECK: [[C:%.+]] = alloca i32***, + // CHECK: @__kmpc_global_thread_num + // CHECK: call void @__kmpc_serialized_parallel + // CHECK: call void [[OUTLINED:@[^(]+]](i32* %{{[^,]+}}, i32* %{{[^,]+}}, i64 %{{[^,]+}}, i32** [[B]], i64 %{{[^,]+}}, i32**** [[C]]) + // CHECK: call void @__kmpc_end_serialized_parallel + // CHECK: ret void +#pragma omp parallel if (0) + b[0][0] = c[0][a][0][a]; +} + +// CHECK: define internal void [[OUTLINED]](i32* {{[^,]+}}, i32* {{[^,]+}}, i64 {{[^,]+}}, i32** {{[^,]+}}, i64 {{[^,]+}}, i32**** {{[^,]+}}) + |