Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* Optimize barrier use */
- // Reduced number of barriers - Before the final reduction loop reads the values of vectors a and c, all updates on these vectors have to be completed; the single explicit barrier ensures this. The nowait clauses remove the two implied barriers after the first two loops, and the implied barrier at the end of the parallel region makes the final nowait safe as well.
- // Fix: the loop index is now declared in each for statement; the original mixed a private(i) clause (referencing an i never declared in this fragment) with loop-local int i declarations that shadowed it.
- #pragma omp parallel default(shared)
- {
- #pragma omp for nowait
- for(int i = 0; i < n; i++)
- a[i] += b[i];
- #pragma omp for nowait
- for(int i = 0; i < n; i++)
- c[i] += d[i];
- #pragma omp barrier
- #pragma omp for nowait reduction(+:sum)
- for(int i = 0; i < n; i++)
- sum += a[i] + c[i];
- }
- /* Avoid the critical region construct */
- // Keep the critical region minimal - The update of the shared variable a must be protected against a data race, but the second statement writes private data only (c and d are in the private clause) and needlessly lengthens the serialized section. It is therefore hoisted out of the critical region; statement order is preserved, so a still reads the old value of c before c is reassigned.
- #pragma omp parallel default(shared) private(c, d)
- {
- ...
- #pragma omp critical
- a += 2 * c;
- c = d * d;
- }
- /* Maximize parallel regions */
- // Multiple combined parallel work-sharing loops - Each "parallel for" pays the full cost of creating/waking the thread team, and the implied barrier at the end of a combined construct cannot be omitted (nowait is not permitted there).
- #pragma omp parallel for
- for(...)
- #pragma omp parallel for
- for(...)
- #pragma omp parallel for
- for(...)
- // Single parallel region enclosing all work-sharing for loops - The cost of the parallel region is amortized over the various work-sharing loops. If the loops are independent, a nowait clause on the inner for constructs could further remove their implied barriers.
- #pragma omp parallel
- {
- #pragma omp for
- for(...)
- #pragma omp for
- for(...)
- #pragma omp for
- for(...)
- }
- /* Avoid parallel regions in inner loops */
- // Parallel region embedded in a loop nest - The overheads of the parallel region (team startup plus the implied barrier) are incurred n² times, once per (i, j) iteration.
- for(int i = 0; i < n; i++)
- for(int j = 0; j < n; j++)
- #pragma omp parallel for
- for(int k = 0; k < n; k++)
- ...
- // Parallel region moved outside of the loop nest - The parallel construct overheads are paid once; every thread redundantly executes the i and j loops, and only the k loop is divided among the threads by the for construct.
- #pragma omp parallel
- for(int i = 0; i < n; i++)
- for(int j = 0; j < n; j++)
- #pragma omp for
- for(int k = 0; k < n; k++)
- ...
- /* False sharing */
- // Example of false sharing - Nthreads equals the number of threads executing the for-loop. The chunk size of 1 causes each thread to update one element of a; neighbouring elements share a cache line, so the concurrent updates result in false sharing.
- int a[Nthreads];
- #pragma omp parallel for default(shared) schedule(static, 1)
- for(int i = 0; i < Nthreads; i++)
- a[i] += i;
- // When updates to an array are frequent, either work with thread-local copies of the array instead of an array indexed by the thread ID, or pad the array so the elements each thread uses lie on distinct cache lines, as below. NOTE(review): this assumes cache_line_size is expressed in ints, not bytes — confirm the unit; padding in bytes would over-allocate by sizeof(int) but remains correct.
- int a[Nthreads][cache_line_size];
- #pragma omp parallel for default(shared) schedule(static, 1)
- for(int i = 0; i < Nthreads; i++)
- a[i][0] += i;
Advertisement
Add Comment
Please, Sign In to add comment