openACC

matheus__serpa

Oct 6th, 2014

560

Never

Add comment

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

C 6.20 KB | None | 0 0

raw download clone embed print report

/*
 * SAXPY: replaces each y[i] with a * x[i] + y[i] for i in [0, n).
 *
 *   n  number of elements to process; nothing happens when n <= 0
 *   a  scalar multiplier
 *   x  input vector, only read here
 *   y  vector updated in place; restrict-qualified, so it must not be
 *      reached through x inside this function
 *
 * The loop is placed under an OpenACC "kernels" construct. The
 * "parallel loop" form is kept below as a commented-out alternative.
 */
void saxpy_parallel(int n, float a, float *x, float *restrict y){
#pragma acc kernels
// #pragma acc parallel loop
    for (int idx = 0; idx < n; ++idx) {
        y[idx] = a * x[idx] + y[idx];
    }
}
/* pgcc -I../common -acc -ta=nvidia -Minfo=accel -o laplace2d_acc laplace2d.c */
// Step one
#pragma omp parallel for shared(m, n, Anew, A)
#pragma acc kernels
// Step two
#pragma acc data copy(A, Anew)
#pragma omp parallel for shared(m, n, Anew, A)
#pragma acc kernels
// Step three
/* The gang(32) clause on the outer loop tells the compiler to launch 32 blocks in the Y (row) direction. The gang(16) clause on the inner loop tells it to launch 16 blocks in the X (column) direction. The vector(16) clause on the outer loop tells the compiler to use blocks that are 16 threads tall, thus processing the loop iterations in SIMD groups of 16. Finally, the vector(32) clause on the inner loop tells the compiler to use blocks that are 32 threads wide (one warp wide). */
#pragma acc data copy(A), create(Anew)
#pragma omp parallel for shared(m, n, Anew, A)
#pragma acc kernels loop gang(32), vector(16)
for( int j = 1; j < n-1; j++) {
#pragma acc loop gang(16), vector(32)
for( int i = 1; i < m-1; i++ )
// Step four
#pragma acc data copy(A), create(Anew)
#pragma omp parallel for shared(m, n, Anew, A)
#pragma acc kernels loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop gang(16), vector(32)
for( int i = 1; i < m-1; i++ )
/*
 * Orthonormalises the first `cols` columns of Q in place.
 *
 *   Q     matrix whose rows have COLS floats each; only Q[0..rows-1][0..cols-1]
 *         is read and written
 *   rows  number of rows used
 *   cols  number of columns used (must not exceed COLS, since Q[i][j] is
 *         indexed with j < cols)
 *
 * The work is done on a transposed copy Qt so that every column of Q becomes
 * a contiguous row.  For each column k in turn: it is scaled to unit length,
 * then its component is subtracted from every later column j > k.
 *
 * Fixes relative to the earlier version:
 *   - `restrict` now qualifies the pointer (it is not valid on `float`).
 *   - the `create` clause uses subarray notation for the whole of Qt instead
 *     of naming the single element Qt[cols][rows].
 *
 * NOTE: a column whose norm is zero makes the normalisation divide by zero,
 * producing non-finite values; that behaviour is unchanged.
 */
void gramSchmidt(float (*restrict Q)[COLS], const int rows, const int cols){
    /* Transposed working copy: row k of Qt is column k of Q. */
    float Qt[cols][rows];
#pragma acc data create(Qt[0:cols][0:rows]) copy(Q[0:rows][0:cols])
    {
        /* Transpose Q into Qt. */
#pragma acc parallel loop
        for(int i=0; i < rows; i++)
            for(int j=0; j < cols; j++)
                Qt[j][i] = Q[i][j];

        for(int k=0; k < cols; k++) {
            /* Normalise column k: accumulate the squared norm in double,
               take the square root, then divide every entry by it. */
#pragma acc parallel
            {
                double tmp = 0.;
#pragma acc loop vector reduction(+:tmp)
                for(int i=0; i < rows; i++) tmp += (Qt[k][i] * Qt[k][i]);
                tmp = sqrt(tmp);
#pragma acc loop vector
                for(int i=0; i < rows; i++) Qt[k][i] /= tmp;
            }

            /* Remove the component along column k from each later column.
               Each j works on its own row of Qt and only reads row k. */
#pragma acc parallel loop vector_length(128)
            for(int j=k+1; j < cols; j++) {
                double tmp=0.;
                for(int i=0; i < rows; i++) tmp += Qt[k][i] * Qt[j][i];
                for(int i=0; i < rows; i++) Qt[j][i] -= tmp * Qt[k][i];
            }
        }

        /* Transpose the result back into Q. */
#pragma acc parallel loop
        for(int i=0; i < rows; i++)
            for(int j=0; j < cols; j++)
                Q[i][j] = Qt[j][i];
    }
}
/*
 * Integer matrix product C = A * B for N x N matrices stored row-major in
 * flat arrays of N*N elements (N is defined elsewhere in the file).
 *
 *   A, B  input matrices; copied to the device on entry to the data region
 *   C     output matrix; copied back from the device on exit
 *
 * Fixes relative to the earlier version:
 *   - `restrict` now qualifies the pointers (`restrict int *A` applies it to
 *     `int`, which C does not allow).
 *   - `#pragma acc region` belongs to the older PGI Accelerator model; the
 *     OpenACC construct is `kernels`.
 *   - removed the stray unary plus in `sum += + (...)`.
 */
void multiplicaMatriz(int *restrict A, int *restrict B, int *C){
#pragma acc data copyin(A[0:N*N], B[0:N*N]) copyout(C[0:N*N])
    {
#pragma acc kernels
        {
            /* Every (i, j) pair writes a distinct element of C, hence the
               `independent` assertion on both loops. */
#pragma acc loop independent vector(16)
            for(int i=0;i<N;i++){
#pragma acc loop independent vector(16)
                for(int j=0;j<N;j++){
                    /* Dot product of row i of A with column j of B. */
                    int sum=0;
#pragma acc loop
                    for(int k=0;k<N;k++){
                        sum += A[(i*N)+k] * B[(k*N)+j];
                    }
                    C[(i*N)+j] = sum;
                }
            }
        }
    }
}
/*
 * Sorts the N-element integer array `a` into ascending order with an
 * odd-even transposition sort (N is defined elsewhere in the file).
 *
 * Each of the N passes runs two phases:
 *   - even phase: compare-and-swap the pairs (0,1), (2,3), ...
 *   - odd phase:  compare-and-swap the pairs (1,2), (3,4), ...
 * Within one phase the pairs are disjoint, which is what the `independent`
 * assertion on the inner loops relies on.  The outer loop over passes must
 * stay sequential.
 *
 * Fixes relative to the earlier version:
 *   - `restrict` now qualifies the pointer (`restrict int *a` applies it to
 *     `int`, which C does not allow).
 *   - `#pragma acc region` belongs to the older PGI Accelerator model; the
 *     OpenACC construct is `kernels`.
 *   - the even phase stopped at `i < N` while reading a[i+1], so an odd N
 *     read and possibly wrote a[N]; it now stops at `i < N-1` like the odd
 *     phase.  Results for even N are unchanged.
 */
void par_impar(int *restrict a){
#pragma acc data copy(a[0:N])
    {
#pragma acc kernels
        {
            for(int j=0;j<N;j++){
                /* Even phase. */
#pragma acc loop independent vector(16)
                for(int i=0;i<N-1;i=i+2){
                    if(a[i] > a[i+1]){
                        int swap = a[i];
                        a[i] = a[i+1];
                        a[i+1] = swap;
                    }
                }
                /* Odd phase. */
#pragma acc loop independent vector(16)
                for(int i=1;i<N-1;i=i+2){
                    if(a[i] > a[i+1]){
                        int swap = a[i];
                        a[i] = a[i+1];
                        a[i+1] = swap;
                    }
                }
            }
        }
    }
}
/*
 * Matrix-vector product x = a * v.
 *
 *   x  output vector, m elements written
 *   a  m-by-n matrix stored row-major in a flat array
 *   v  input vector, n elements read
 *   m  number of rows
 *   n  number of columns
 *
 * Rows are distributed with an OpenACC gang loop; each row's dot product is
 * a worker-level sum reduction.  This variant carries no data clauses.
 */
void matvecmul( float* x, float* a, float* v, int m, int n ){
#pragma acc parallel loop gang
    for( int row = 0; row < m; ++row ){
        float dot = 0.0;
#pragma acc loop worker reduction(+:dot)
        for( int col = 0; col < n; ++col ){
            dot += a[row*n+col]*v[col];
        }
        x[row] = dot;
    }
}
/*
 * Matrix-vector product x = a * v, with explicit OpenACC data movement.
 *
 *   x  output vector, m elements; listed in copyout
 *   a  m-by-n matrix stored row-major in a flat array; listed in copyin
 *   v  input vector, n elements; listed in copyin
 *   m  number of rows
 *   n  number of columns
 *
 * Rows map to gangs; the per-row dot product is a worker-level sum
 * reduction.
 */
void matvecmul( float* x, float* a, float* v, int m, int n ){
#pragma acc parallel loop gang copyin(a[0:n*m],v[0:n]) copyout(x[0:m])
    for( int r = 0; r < m; ++r ){
        float acc = 0.0;
#pragma acc loop worker reduction(+:acc)
        for( int c = 0; c < n; ++c ){
            acc += a[r*n+c]*v[c];
        }
        x[r] = acc;
    }
}
/*
 * Matrix-vector product x = a * v, using the pcopyin/pcopyout data clauses
 * on the compute construct.
 *
 *   x  output vector, m elements; listed in pcopyout
 *   a  m-by-n matrix stored row-major in a flat array; listed in pcopyin
 *   v  input vector, n elements; listed in pcopyin
 *   m  number of rows
 *   n  number of columns
 *
 * Rows map to gangs; the per-row dot product is a worker-level sum
 * reduction.
 */
void matvecmul( float* x, float* a, float* v, int m, int n ){
#pragma acc parallel loop gang pcopyin(a[0:n*m],v[0:n]) pcopyout(x[0:m])
    for( int out = 0; out < m; ++out ){
        float total = 0.0;
#pragma acc loop worker reduction(+:total)
        for( int in = 0; in < n; ++in ){
            total += a[out*n+in]*v[in];
        }
        x[out] = total;
    }
}
/*
 * Computes one entry of a matrix-vector product: x[i] = sum over j in
 * [0, n) of a[i*n + j] * v[j].
 *
 *   v  input vector, n elements read
 *   x  output vector; only x[i] is written
 *   a  matrix stored row-major in a flat array with n columns
 *   i  index of the row to process
 *   n  number of columns
 *
 * Declared as an OpenACC worker-level routine; the loop carries a sum
 * reduction on the accumulator.
 */
#pragma acc routine worker
void matvec( float* v, float* x, float* a, int i, int n ){
    float rowsum = 0;
#pragma acc loop reduction(+:rowsum)
    for( int col = 0; col < n; ++col ){
        rowsum += a[i*n+col]*v[col];
    }
    x[i] = rowsum;
}
#pragma acc data copy(a0[0:sz*sz*sz]), create(a1[0:sz*sz*sz], i,j,k,iter), copyin(sz,fac,n)
{
for (iter = 0; iter < ITERATIONS; iter++) {
#pragma acc parallel loop
for (i = 1; i < n+1; i++) {
#pragma acc loop
for (j = 1; j < n+1; j++) {
#pragma acc loop
for (k = 1; k < n+1; k++) {

Add Comment

Please, Sign In to add comment