Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* Optimize barrier use */
- // Reduced number of barriers - Before the final reduction loop reads the values of vectors a and c, all updates on these vectors have to be completed; the single explicit barrier ensures this. The nowait clauses remove the two implied barriers after the first two loops, and the implied barrier at the end of the parallel region makes the final nowait safe as well.
- // Fix: the loop index is now declared in each for statement; the original mixed a private(i) clause (referencing an i never declared in this fragment) with loop-local int i declarations that shadowed it.
- #pragma omp parallel default(shared)
- {
- #pragma omp for nowait
- for(int i = 0; i < n; i++)
- a[i] += b[i];
- #pragma omp for nowait
- for(int i = 0; i < n; i++)
- c[i] += d[i];
- #pragma omp barrier
- #pragma omp for nowait reduction(+:sum)
- for(int i = 0; i < n; i++)
- sum += a[i] + c[i];
- }
- /* Avoid the critical region construct */
- // Keep the critical region minimal - The update of the shared variable a must be protected against a data race, but the second statement writes private data only (c and d are in the private clause) and needlessly lengthens the serialized section. It is therefore hoisted out of the critical region; statement order is preserved, so a still reads the old value of c before c is reassigned.
- #pragma omp parallel default(shared) private(c, d)
- {
- ...
- #pragma omp critical
- a += 2 * c;
- c = d * d;
- }
- /* Maximize parallel regions */
- // Multiple combined parallel work-sharing loops - Each "parallel for" pays the full cost of creating/waking the thread team, and the implied barrier at the end of a combined construct cannot be omitted (nowait is not permitted there).
- #pragma omp parallel for
- for(...)
- #pragma omp parallel for
- for(...)
- #pragma omp parallel for
- for(...)
- // Single parallel region enclosing all work-sharing for loops - The cost of the parallel region is amortized over the various work-sharing loops. If the loops are independent, a nowait clause on the inner for constructs could further remove their implied barriers.
- #pragma omp parallel
- {
- #pragma omp for
- for(...)
- #pragma omp for
- for(...)
- #pragma omp for
- for(...)
- }
- /* Avoid parallel regions in inner loops */
- // Parallel region embedded in a loop nest - The overheads of the parallel region (team startup plus the implied barrier) are incurred n² times, once per (i, j) iteration.
- for(int i = 0; i < n; i++)
- for(int j = 0; j < n; j++)
- #pragma omp parallel for
- for(int k = 0; k < n; k++)
- ...
- // Parallel region moved outside of the loop nest - The parallel construct overheads are paid once; every thread redundantly executes the i and j loops, and only the k loop is divided among the threads by the for construct.
- #pragma omp parallel
- for(int i = 0; i < n; i++)
- for(int j = 0; j < n; j++)
- #pragma omp for
- for(int k = 0; k < n; k++)
- ...
- /* False sharing */
- // Example of false sharing - Nthreads equals the number of threads executing the for-loop. The chunk size of 1 causes each thread to update one element of a; neighbouring elements share a cache line, so the concurrent updates result in false sharing.
- int a[Nthreads];
- #pragma omp parallel for default(shared) schedule(static, 1)
- for(int i = 0; i < Nthreads; i++)
- a[i] += i;
- // When updates to an array are frequent, either work with thread-local copies of the array instead of an array indexed by the thread ID, or pad the array so the elements each thread uses lie on distinct cache lines, as below. NOTE(review): this assumes cache_line_size is expressed in ints, not bytes — confirm the unit; padding in bytes would over-allocate by sizeof(int) but remains correct.
- int a[Nthreads][cache_line_size];
- #pragma omp parallel for default(shared) schedule(static, 1)
- for(int i = 0; i < Nthreads; i++)
- a[i][0] += i;
Advertisement
Add Comment
Please, Sign In to add comment