openmp101 · 2018. 3. 15. · two ways to specify the number of threads 10 omp_set_num_threads(4);...
TRANSCRIPT
OpenMP 101
Junwen Li
ACF Spring Training Workshop, March 15-16, 2018
Memory types
2
CPUMemory
CPUMemory
CPUMemory
CPUCoreMemoryCPUCore CPUCore
Network
Distributed Shared
MPI: Develop parallel programs from the very beginning
OpenMP: Incrementally parallelize serial programs
compute node
... ...... ...
Computing resources good for OpenMP
3
Computing resources good for OpenMP
4
https://portal.tacc.utexas.edu/user-guides/stampede2
Model: Intel Xeon Phi 7250 ("Knights Landing")
Total cores per KNL node: 68 cores on a single socket
Stampede2 hosts 4,200 KNL compute nodes
https://sciencenode.org/feature/autobahn-xsede-users.php
We will talk about...
5
• Fork/join model
• OpenMP syntax and directives
  – Hello world example
  – 𝜋 calculation
  – Many loops or nested loops
• Real research example
  – Optical properties of solar cell materials
  – Five lines to parallelize the code
Fork/join model of the OpenMP framework
6
MasterThread
MasterThread
MasterThread
Thread
Thread
MasterThread
MasterThread
Thread
Thread
MasterThread
➢ Serial program
➢ OpenMP program
Serial "Hello World" in C/C++
7
#include<stdio.h>
int main(){int nthread =1;int ithread =1;printf(“Helloworldfrom%dof%d!\n”,ithread,nthread);
return0;}
$ ./hello_serial_c.xHello World from 1 of 1!
OpenMP "Hello World" in C/C++
8
#include<stdio.h>#include<omp.h>
int main(){omp_set_num_threads(4);
#pragmaomp parallel{int nthread =omp_get_num_threads();int ithread =omp_get_thread_num();printf(“Helloworldfrom%dof%d.\n”,ithread,nthread);
}return0;
}
$ ./hello_omp_c.xHello World from 0 of 4!Hello World from 3 of 4!Hello World from 1 of 4!Hello World from 2 of 4!
Thread index starts with 0!
Compiler Directives
Runtime Library
How to compile OpenMP code
9
Two ways to specify the number of threads
10
omp_set_num_threads(4);
Inside the source code
$ ./hello_omp_c.x
Hello World from 1 of 6!
Hello World from 0 of 6!
Hello World from 3 of 6!
Hello World from 2 of 6!
Hello World from 5 of 6!
Hello World from 4 of 6!
Outside the source code
export OMP_NUM_THREADS=4
$ ./hello_omp_c.xHello World from 0 of 4!Hello World from 3 of 4!Hello World from 1 of 4!Hello World from 2 of 4!
omp_set_num_threads(6);
export OMP_NUM_THREADS=6
for ((n=1; n<=16; n++))
do
  export OMP_NUM_THREADS=$n
  /usr/bin/time -f "%e" OPENMP_CODE.EXE
done
Three core components in OpenMP
11
Compiler Directives
#pragma omp parallel
Environment Variables
OMP_NUM_THREADS
Runtime Library
omp_set_num_threads()
Compiler Directives
• added to a serial program
• interpreted at compile time
Runtime Library
• added to a serial program
• executed at runtime
Environment Variables
• added after compile time
• used at runtime
... ... ...
Example: 𝝅 calculation
12
$$\int_0^1 \frac{4}{1+x^2}\,dx = \pi, \qquad \sum_{i=1}^{N} \frac{4}{1+x_i^2}\,\Delta \cong \pi$$
$$x_i = (i - 0.5)\,\Delta, \qquad \Delta = 1/N$$
#define NSTEPS 2000
int main(){
double pi, sum, step, x;long int i;
sum = 0;step = 1.0 / (double)NSTEPS;for (i=1; i<=NSTEPS; i++)
{x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);
}pi = sum * step;
printf("Pi is %4.12f\n", pi);
return 0;}
OpenMP version of 𝝅 calculation
13
#define NSTEPS 2000
int main(){
double pi, sum, step, x;long int i;
sum = 0;step = 1.0 / (double)NSTEPS;
#pragma omp parallel#pragma omp for private(i,x) reduction(+:sum)
for (i=1; i<=NSTEPS; i++){
x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);
}pi = sum * step;
printf("Pi is %4.12f\n", pi);
return 0;}
#define NSTEPS 2000
int main(){
double pi, sum, step, x;long int i;
sum = 0;step = 1.0 / (double)NSTEPS;
#pragma omp parallel for private(i,x) reduction(+:sum)for (i=1; i<=NSTEPS; i++)
{x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);
}pi = sum * step;
printf("Pi is %4.12f\n", pi);
return 0;}
Speedup vs number of threads
14
NSTEPS=20000000000
for construct -- private
15
#define NSTEPS 2000
int main(){double pi, sum, step, x;long int i;
sum = 0;step = 1.0 / (double)NSTEPS;
#pragma omp parallel for private(i,x) reduction(+:sum)for (i=1; i<=NSTEPS; i++){x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);
}pi = sum * step;
printf("Pi is %4.12f\n", pi);
return 0;}
doublex0;longint i0;
for(i0=1;i0<=1000;i0++){x0=step*((double)i0- 0.5);...
}
doublex1;longint i1;
for(i1=1001;i1<=2000;i1++){x1=step*((double)i1- 0.5);...}
Thread0 Thread1OMP_NUM_THREADS=2
for construct -- reduction
16
#define NSTEPS 2000
int main(){double pi, sum, step, x;long int i;
sum = 0;step = 1.0 / (double)NSTEPS;
#pragma omp parallel for private(i,x) reduction(+:sum)for (i=1; i<=NSTEPS; i++){x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);
}pi = sum * step;
printf("Pi is %4.12f\n", pi);
return 0;}
Thread0 Thread1OMP_NUM_THREADS=2
sum+=sum0+sum1;
doublex0;longint i0;doublesum0=0;for(i0=1;i0<=1000;i0++){x0=step*((double)i0- 0.5);sum0+=4.0/(1.0+x0*x0);
}
doublex1;longint i1;doublesum1=0;for(i1=1001;i1<=2000;i1++){x1=step*((double)i1- 0.5);sum1+=4.0/(1.0+x1*x1);}
for construct -- reduction
17
#define NSTEPS 2000
int main(){double pi, sum, step, x;long int i;
sum = 0;step = 1.0 / (double)NSTEPS;
#pragma omp parallel for private(i,x) reduction(+:sum)for (i=1; i<=NSTEPS; i++){x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);
}pi = sum * step;
printf("Pi is %4.12f\n", pi);
return 0;}
Thread0 Thread1OMP_NUM_THREADS=2
sum+=sum0+sum1;
doublex0;longint i0;doublesum0=0;for(i0=1;i0<=1000;i0++){x0=step*((double)i0- 0.5);sum0+=4.0/(1.0+x0*x0);
}
doublex1;longint i1;doublesum1=0;for(i1=1001;i1<=2000;i1++){x1=step*((double)i1- 0.5);sum1+=4.0/(1.0+x1*x1);}
Static loop iteration assignment
18
#pragma omp parallel for for (i = 0; i < 6; i++)
Thread0 Thread1OMP_NUM_THREADS=2
i =0,1,2 i =3,4,5
Thread0 Thread1OMP_NUM_THREADS=3
i =0,1 i =2,3
Thread2
i =4,5
Thread0 Thread1OMP_NUM_THREADS=4 Thread2 Thread3
Thread0 Thread1OMP_NUM_THREADS=5 Thread2 Thread3 Thread4
OMP_NUM_THREADS=6 ......
Static loop iteration assignment
19
#pragma omp parallel for for (i = 0; i < 6; i++)
Thread0 Thread1OMP_NUM_THREADS=2
i =0,1,2 i =3,4,5
Thread0 Thread1OMP_NUM_THREADS=3
i =0,1 i =2,3
Thread2
i =4,5
Thread0 Thread1OMP_NUM_THREADS=4
i =0,1 i =2,3
Thread2
i =4
Thread3
i =5
Thread0 Thread1OMP_NUM_THREADS=5
i =0,1 i =2
Thread2
i =3
Thread3
i =4
Thread4
i =5
OMP_NUM_THREADS=6 ......
Static loop iteration assignment
20
#include <stdio.h>#include <omp.h>
int main (){
int i, N;N = 6;
#pragma omp parallel{
#pragma omp forfor (i = 0; i < 6; i++)
printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());
#pragma omp forfor (i = 0; i < 6; i++)
printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());}
return 0;}
Loop 1: iter 0, thread_id 0Loop 1: iter 1, thread_id 1Loop 1: iter 2, thread_id 2Loop 1: iter 3, thread_id 3Loop 1: iter 4, thread_id 4Loop 1: iter 5, thread_id 5Loop 2: iter 0, thread_id 0Loop 2: iter 1, thread_id 1Loop 2: iter 2, thread_id 2Loop 2: iter 3, thread_id 3Loop 2: iter 4, thread_id 4Loop 2: iter 5, thread_id 5
Static loop iteration assignment
21
export OMP_NUM_THREADS=6
Loop 1: iter 0, thread_id 0Loop 1: iter 1, thread_id 1Loop 1: iter 2, thread_id 2Loop 1: iter 3, thread_id 3Loop 1: iter 4, thread_id 4Loop 1: iter 5, thread_id 5Loop 2: iter 0, thread_id 0Loop 2: iter 1, thread_id 1Loop 2: iter 2, thread_id 2Loop 2: iter 3, thread_id 3Loop 2: iter 4, thread_id 4Loop 2: iter 5, thread_id 5
FirstRun SecondRun
Dynamic loop iteration assignment
22
#include <stdio.h>#include <omp.h>
int main (){
int i, N;N = 6;
#pragma omp parallel{
#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++)
printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());
#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++)
printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());}
return 0;}
export OMP_SCHEDULE="dynamic,1"   (takes effect only for loops declared with schedule(runtime))
Dynamic loop iteration assignment
23
Loop 1: iter 0, thread_id 2Loop 1: iter 1, thread_id 0Loop 1: iter 2, thread_id 1Loop 1: iter 3, thread_id 2Loop 1: iter 4, thread_id 2Loop 1: iter 5, thread_id 2Loop 2: iter 0, thread_id 4Loop 2: iter 1, thread_id 3Loop 2: iter 2, thread_id 5Loop 2: iter 3, thread_id 2Loop 2: iter 4, thread_id 0Loop 2: iter 5, thread_id 1
export OMP_NUM_THREADS=6
Loop 1: iter 0, thread_id 0Loop 1: iter 1, thread_id 3Loop 1: iter 2, thread_id 2Loop 1: iter 3, thread_id 1Loop 1: iter 4, thread_id 4Loop 1: iter 5, thread_id 1Loop 2: iter 0, thread_id 5Loop 2: iter 1, thread_id 2Loop 2: iter 2, thread_id 1Loop 2: iter 3, thread_id 3Loop 2: iter 4, thread_id 4Loop 2: iter 5, thread_id 0
FirstRun SecondRun
Static schedule – fairness in terms of iterations
24
Thread0 Thread1 Thread2
1234
5678
9101112
Thread0 Thread1 Thread2
15678
9101112
2
3
4
Computing time
Dynamic schedule – fairness in terms of load
25
Thread0 Thread1 Thread2 Thread0 Thread1 Thread2
15678
9101112
2
3
4
1 2 3
Computing time
export OMP_SCHEDULE="dynamic,1"
Dynamic schedule – fairness in terms of load
26
Thread0 Thread1 Thread2 Thread0 Thread1 Thread2
15678
9101112
2
3
4
1 2 3
45 6
Computing time
export OMP_SCHEDULE="dynamic,1"
Dynamic schedule – fairness in terms of load
27
Thread0 Thread1 Thread2 Thread0 Thread1 Thread2
15678
9101112
2
3
4
1 2 3
45 67 8
Computing time
export OMP_SCHEDULE="dynamic,1"
Dynamic schedule – fairness in terms of load
28
Thread0 Thread1 Thread2 Thread0 Thread1 Thread2
15678
9101112
2
3
4
1 2 3
45 67 89 10
Computing time
export OMP_SCHEDULE="dynamic,1"
Dynamic schedule – fairness in terms of load
29
Thread0 Thread1 Thread2 Thread0 Thread1 Thread2
15678
9101112
2
3
4
1 2 3
45 67 89 10
11 12
Computing time
export OMP_SCHEDULE="dynamic,1"
Dynamic schedule – fairness in terms of load
30
Thread0 Thread1 Thread2 Thread0 Thread1 Thread2
15678910
1112
2
3
4
1 2 3
45 67 89 10
11 12
Computing time
export OMP_SCHEDULE="dynamic,1"        export OMP_SCHEDULE="dynamic,2"
When to use static schedule
31
#include <stdio.h>#include <omp.h>
int main (){int i, N;N = 6;
#pragma omp parallel{#pragma omp forfor (i = 0; i < 6; i++){printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());do something on a[i];......
}
#pragma omp forfor (i = 0; i < 6; i++)printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());do something else on a[i];......
}
return 0;}
Loop 1: iter 0, thread_id 0Loop 1: iter 1, thread_id 1Loop 1: iter 2, thread_id 2Loop 1: iter 3, thread_id 3Loop 1: iter 4, thread_id 4Loop 1: iter 5, thread_id 5Loop 2: iter 0, thread_id 0Loop 2: iter 1, thread_id 1Loop 2: iter 2, thread_id 2Loop 2: iter 3, thread_id 3Loop 2: iter 4, thread_id 4Loop 2: iter 5, thread_id 5
When to use dynamic schedule
32
#include <stdio.h>#include <omp.h>
int main (){int i, N;N = 6;
#pragma omp parallel{#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++){printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());if (i == ...){}
}
#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++)printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());if (i == ...){}
}
return 0;}
Avoid parallel regions in inner loops
33
for(i=0;i<n;i++)for(j=0;j<n;j++)#pragmaomp parallel#pragmaomp forfor(k=0;k<n;k++){......}
#pragmaomp parallelfor(i=0;i<n;i++)for(j=0;j<n;j++)#pragmaomp forfor(k=0;k<n;k++){......}
First version: parallel construct overheads are incurred $n^2$ times
Second version: parallel construct overheads are minimized
Collapse nested loops
34
for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}
#pragmaomp parallelforcollapse(3)for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}
#pragmaomp parallelforfor(ijk=0;ijk<n*n*n;ijk++){......}
Equivalent to
Collapse nested loops
35
for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}
#pragmaomp parallelforcollapse(3)for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}
for(i=0;i<n;i++)x=i;for(j=0;j<n;j++)for(k=0;k<n;k++){......}
#pragmaomp parallelforfor(ijk=0;ijk<n*n*n;ijk++){......}
Equivalent to
Collapse nested loops
36
for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}
#pragmaomp parallelforcollapse(3)for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}
for(i=0;i<n;i++)x=i;for(j=0;j<n;j++)for(k=0;k<n;k++){......}
#pragmaomp parallelforfor(ijk=0;ijk<n*n*n;ijk++){......}
Equivalent to
for_nested.c(13): error: parallel loops with collapse must be perfectly nested
    x = i;
    ^
Novel organic-inorganic perovskite AMX3 solar cells
37
2009,3.8%
25%21%
CH3NH3PbI3
Organic Inorganic Perovskite
Perovskite indicates the structure of crystals like CaTiO3
Perovskite materials are generally represented as AMX3
A: Cation    M: Metal Cation    X: Anion
CH3NH3, HC(NH2)2
Pb, Sn
I, Cl, Br
http://facserv.utk.edu/sustainability/on-campus/energy/    Data from National Renewable Energy Laboratory
Perovskites under illumination
38
𝐸(𝑘)
𝑘
Convergence of dielectric function with respect to k-grid sampling
39
2x2x2 5x5x5
10x10x10 20x20x20
4.41 hours for 30x30x30
Five lines to get OpenMP parallelization of the loop over k points
40
useOMP_LIB.......
blockinteger :: ik, nbandtot, nbandvalreal (DP) :: xk(3), scissornbandtot = ctrl%get_nband_tot()nbandval = ctrl%get_nband_val()scissor = ctrl%get_scissor_shift()!$OMP PARALLEL PRIVATE(ik, xk, eigvec, eig, hk, dpl)allocate ( hk(num_wann, num_wann) )allocate ( eig(num_wann) )allocate ( eigvec(num_wann, num_wann) )allocate ( dpl(num_wann, num_wann, 3) )!$OMP DOdo ik=1,nkpt
xk = kg%get_xk(ik)if (OMP_GET_THREAD_NUM() == 0) then
write ( *, '(A,I15,A,I15)' ) "Working on ", ik, " / ", nkptend ifcall recipe_k(p_ham,p_dpl_x,p_dpl_y,p_dpl_z,xk,num_wann,hk,eigvec,eig,dpl)call dist_k(wg,delta,nbandtot,nbandval,eig,scissor,dpl,epsi_k(:,:,ik),epsr_k(:,:,ik))
end do!$OMP END DOdeallocate( hk, eig, eigvec, dpl )!$OMP END PARALLEL
end block
......
Timing for 30x30x30
41
Threads Time(seconds) Time(minutes) Time(hours)
1 15893.38 264.89 4.41
2 7963.32 132.72 2.21
4 4106.97 68.45 1.14
8 2171.96 36.20 0.60
16 1106.89 18.45 0.31
Demonstrate Amdahl's law by varying the number of k points
42
MasterThread
MasterThread
Thread
Thread
MasterThread
Loop over k points    Load the input data    Save the output data
Demonstrate Amdahl's law by varying the number of k points
43
Summary
44
• OpenMP syntax and directives
  – Hello world example
  – 𝜋 calculation
  – Many loops or nested loops
• Demonstration with real research example
  – Five lines to parallelize the code
  – Amdahl's law
Online resources
45
• OpenMP: http://www.openmp.org/
• Lawrence Livermore National Laboratory: https://computing.llnl.gov/tutorials/openMP
• Dartmouth Research Computing: https://www.dartmouth.edu/~rc/classes/intro_openmp
• UTexas Computing Center: http://pages.tacc.utexas.edu/~eijkhout/pcse/html/index.html