Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * SAXPY: overwrite each y[k] with a * x[k] + y[k] for k in [0, n).
 *
 * n  number of elements processed in x and y
 * a  scalar multiplier applied to every element of x
 * x  input vector, only read
 * y  input/output vector; it is restrict-qualified, so the elements
 *    written through it must not be reachable through x
 *
 * The loop is offloaded with "acc kernels". The original snippet kept
 * "acc parallel loop" next to it as a disabled alternative directive.
 */
void saxpy_parallel(int n, float a, float *x, float *restrict y){
    #pragma acc kernels
    for (int k = 0; k < n; k++) {
        y[k] += a * x[k];
    }
}
- /* pgcc -I../common -acc -ta=nvidia -Minfo=accel -o laplace2d_acc laplace2d.c */
- // Step one
- #pragma omp parallel for shared(m, n, Anew, A)
- #pragma acc kernels
- // Step two
- #pragma acc data copy(A, Anew)
- #pragma omp parallel for shared(m, n, Anew, A)
- #pragma acc kernels
- // Step three
- /* The gang(32) clause on the outer loop tells the compiler to launch 32 blocks in the Y (row) direction. The gang(16) clause on the inner loop tells it to launch 16 blocks in the X (column) direction. The vector(16) clause on the outer loop tells the compiler to use blocks that are 16 threads tall, thus processing the loop iterations in SIMD groups of 16. Finally, the vector(32) clause on the inner loop tells the compiler to use blocks that are 32 threads wide (one warp wide). */
- #pragma acc data copy(A), create(Anew)
- #pragma omp parallel for shared(m, n, Anew, A)
- #pragma acc kernels loop gang(32), vector(16)
- for( int j = 1; j < n-1; j++) {
- #pragma acc loop gang(16), vector(32)
- for( int i = 1; i < m-1; i++ )
- // Step four
- #pragma acc data copy(A), create(Anew)
- #pragma omp parallel for shared(m, n, Anew, A)
- #pragma acc kernels loop
- for( int j = 1; j < n-1; j++) {
- #pragma acc loop gang(16), vector(32)
- for( int i = 1; i < m-1; i++ )
- void gramSchmidt(restrict float Q[][COLS], const int rows, const int cols){
- float Qt[cols][rows];
- #pragma acc data create(Qt[cols][rows]) copy(Q[0:rows][0:cols])
- {
- //transpose Q in Qt
- #pragma acc parallel loop
- for(int i=0; i < rows; i++)
- for(int j=0; j < cols; j++)
- Qt[j][i] = Q[i][j];
- for(int k=0; k < cols; k++) {
- #pragma acc parallel
- {
- double tmp = 0.;
- #pragma acc loop vector reduction(+:tmp)
- for(int i=0; i < rows; i++) tmp += (Qt[k][i] * Qt[k][i]);
- tmp = sqrt(tmp);
- #pragma acc loop vector
- for(int i=0; i < rows; i++) Qt[k][i] /= tmp;
- }
- #pragma acc parallel loop vector_length(128)
- for(int j=k+1; j < cols; j++) {
- double tmp=0.;
- for(int i=0; i < rows; i++) tmp += Qt[k][i] * Qt[j][i];
- for(int i=0; i < rows; i++) Qt[j][i] -= tmp * Qt[k][i];
- }
- }
- #pragma acc parallel loop
- for(int i=0; i < rows; i++)
- for(int j=0; j < cols; j++)
- Q[i][j] = Qt[j][i];
- }
- }
- void multiplicaMatriz(restrict int *A, restrict int *B, int *C){
- #pragma acc data copyin(A[0:N*N], B[0:N*N]) copyout(C[0:N*N])
- {
- #pragma acc region
- {
- #pragma acc loop independent vector(16)
- for(int i=0;i<N;i++){
- #pragma acc loop independent vector(16)
- for(int j=0;j<N;j++){
- int sum=0;
- #pragma acc loop
- for(int k=0;k<N;k++){
- sum += + ( A[(i*N)+k] * B[(k*N)+j]);
- }
- C[(i*N)+j] = sum;
- }
- }
- }
- }
- }
- void par_impar(restrict int *a){
- #pragma acc data copy(a[0:N])
- {
- #pragma acc region
- {
- for(int j=0;j<N;j++){
- #pragma acc loop independent vector(16)
- for(int i=0;i<N;i=i+2){
- if(a[i] > a[i+1]){
- int swap = a[i];
- a[i] = a[i+1];
- a[i+1] = swap;
- }
- }
- #pragma acc loop independent vector(16)
- for(int i=1;i<N-1;i=i+2){
- if(a[i] > a[i+1]){
- int swap = a[i];
- a[i] = a[i+1];
- a[i+1] = swap;
- }
- }
- }
- }
- }
- }
/*
 * Matrix-vector product: x[i] = sum over j of a[i*n + j] * v[j].
 *
 * x  output vector, m elements, fully overwritten
 * a  m x n matrix, indexed as a[i*n + j]
 * v  input vector, n elements
 * m  number of rows of a (and elements of x)
 * n  number of columns of a (and elements of v)
 *
 * Rows are distributed with "loop gang". Each dot product is a
 * "loop worker" sum reduction into xx. This variant carries no data
 * clauses of its own.
 */
void matvecmul( float* x, float* a, float* v, int m, int n ){
    #pragma acc parallel loop gang
    for (int i = 0; i < m; ++i) {
        /* Widen before multiplying: i * n can exceed INT_MAX for large
           matrices, and signed int overflow is undefined behaviour. */
        const long long row = (long long)i * n;
        float xx = 0.0f;
        #pragma acc loop worker reduction(+:xx)
        for (int j = 0; j < n; ++j) {
            xx += a[row + j] * v[j];
        }
        x[i] = xx;
    }
}
/*
 * Matrix-vector product: x[i] = sum over j of a[i*n + j] * v[j].
 *
 * x  output vector, m elements, fully overwritten
 * a  m x n matrix, indexed as a[i*n + j]
 * v  input vector, n elements
 * m  number of rows of a (and elements of x)
 * n  number of columns of a (and elements of v)
 *
 * Data movement is explicit: a and v are copied to the device (copyin)
 * and x is copied back afterwards (copyout). Rows are distributed with
 * "loop gang". Each dot product is a "loop worker" sum reduction.
 */
void matvecmul( float* x, float* a, float* v, int m, int n ){
    /* The element count is widened as well, so n * m cannot overflow int. */
    #pragma acc parallel loop gang copyin(a[0:(long long)n*m],v[0:n]) copyout(x[0:m])
    for (int i = 0; i < m; ++i) {
        /* Widen before multiplying: i * n can exceed INT_MAX for large
           matrices, and signed int overflow is undefined behaviour. */
        const long long row = (long long)i * n;
        float xx = 0.0f;
        #pragma acc loop worker reduction(+:xx)
        for (int j = 0; j < n; ++j) {
            xx += a[row + j] * v[j];
        }
        x[i] = xx;
    }
}
/*
 * Matrix-vector product: x[i] = sum over j of a[i*n + j] * v[j].
 *
 * x  output vector, m elements, fully overwritten
 * a  m x n matrix, indexed as a[i*n + j]
 * v  input vector, n elements
 * m  number of rows of a (and elements of x)
 * n  number of columns of a (and elements of v)
 *
 * pcopyin / pcopyout are the short forms of present_or_copyin and
 * present_or_copyout: data already on the device (for example from an
 * enclosing data region in the caller) is reused instead of being
 * transferred again. Rows are distributed with "loop gang". Each dot
 * product is a "loop worker" sum reduction.
 */
void matvecmul( float* x, float* a, float* v, int m, int n ){
    /* The element count is widened as well, so n * m cannot overflow int. */
    #pragma acc parallel loop gang pcopyin(a[0:(long long)n*m],v[0:n]) pcopyout(x[0:m])
    for (int i = 0; i < m; ++i) {
        /* Widen before multiplying: i * n can exceed INT_MAX for large
           matrices, and signed int overflow is undefined behaviour. */
        const long long row = (long long)i * n;
        float xx = 0.0f;
        #pragma acc loop worker reduction(+:xx)
        for (int j = 0; j < n; ++j) {
            xx += a[row + j] * v[j];
        }
        x[i] = xx;
    }
}
/*
 * Compute one element of a matrix-vector product:
 *   x[i] = sum over j of a[i*n + j] * v[j].
 *
 * v  input vector, n elements
 * x  output vector; only x[i] is written
 * a  matrix indexed as a[i*n + j], i.e. n elements per row
 * i  index of the row to process
 * n  number of columns of a (and elements of v)
 *
 * Declared "acc routine worker", so it can be called from device code
 * and may contain worker-level parallelism.
 */
#pragma acc routine worker
void matvec( float* v, float* x, float* a, int i, int n ){
    /* Widen before multiplying: i * n can exceed INT_MAX for large
       matrices, and signed int overflow is undefined behaviour. */
    const long long row = (long long)i * n;
    float xx = 0.0f;
    #pragma acc loop reduction(+:xx)
    for (int j = 0; j < n; ++j) {
        xx += a[row + j] * v[j];
    }
    x[i] = xx;
}
- #pragma acc data copy(a0[0:sz*sz*sz]), create(a1[0:sz*sz*sz], i,j,k,iter), copyin(sz,fac,n)
- {
- for (iter = 0; iter < ITERATIONS; iter++) {
- #pragma acc parallel loop
- for (i = 1; i < n+1; i++) {
- #pragma acc loop
- for (j = 1; j < n+1; j++) {
- #pragma acc loop
- for (k = 1; k < n+1; k++) {
Advertisement
Add Comment
Please, Sign In to add comment