openmp101 · 2018. 3. 15. · two ways to specify the number of threads 10 omp_set_num_threads(4);...

OpenMP 101 — Junwen Li

ACF Spring Training Workshop, March 15-16, 2018

Memory types

2

CPUMemory

CPUMemory

CPUMemory

CPUCoreMemoryCPUCore CPUCore

Network

Distributed Shared

MPI: Develop parallel programs from the very beginning

OpenMP: Incrementally parallelize serial programs

compute node

... ...... ...

Computing resources good for OpenMP

3

Computing resources good for OpenMP

4

https://portal.tacc.utexas.edu/user-guides/stampede2

Model: Intel Xeon Phi 7250 ("Knights Landing")

Total cores per KNL node: 68 cores on a single socket

Stampede2 hosts 4,200 KNL compute nodes

https://sciencenode.org/feature/autobahn-xsede-users.php

We will talk about...

5

• Fork/join model

• OpenMP syntax and directives
  – Hello world example
  – π calculation
  – Many loops or nested loops

• Real research example
  – Optical properties of solar cell materials
  – Five lines to parallelize the code

Fork / join model of OpenMP framework

6

MasterThread

MasterThread

MasterThread

Thread

Thread

MasterThread

MasterThread

Thread

Thread

MasterThread

➢ Serial program

➢ OpenMP program

Serial "Hello World" in C/C++

7

#include<stdio.h>

int main(){int nthread =1;int ithread =1;printf(“Helloworldfrom%dof%d!\n”,ithread,nthread);

return0;}

$ ./hello_serial_c.xHello World from 1 of 1!

OpenMP "Hello World" in C/C++

8

#include<stdio.h>#include<omp.h>

int main(){omp_set_num_threads(4);

#pragmaomp parallel{int nthread =omp_get_num_threads();int ithread =omp_get_thread_num();printf(“Helloworldfrom%dof%d.\n”,ithread,nthread);

}return0;

}

$ ./hello_omp_c.xHello World from 0 of 4!Hello World from 3 of 4!Hello World from 1 of 4!Hello World from 2 of 4!

Thread index starts with 0!

Compiler Directives

Runtime Library

How to compile OpenMP code

9

Two ways to specify the number of threads

10

omp_set_num_threads(4);

Inside the source code

./hello_omp_c.x
Hello World from 1 of 6!
Hello World from 0 of 6!
Hello World from 3 of 6!
Hello World from 2 of 6!
Hello World from 5 of 6!
Hello World from 4 of 6!

Outside the source code

exportOMP_NUM_THREADS=4

$ ./hello_omp_c.xHello World from 0 of 4!Hello World from 3 of 4!Hello World from 1 of 4!Hello World from 2 of 4!

omp_set_num_threads(6);

exportOMP_NUM_THREADS=6

for ((n=1; n<=16; n++))doexport OMP_NUM_THREADS=$n/usr/bin/time -f "%e" OPENMP_CODE.EXEdone

Three core components in OpenMP

11

Compiler Directives

#pragma omp parallel

Environment Variables

OMP_NUM_THREADS

Runtime Library

omp_set_num_threads()

Directives

• added to a serial program
• interpreted at compile time

Directives

• added to a serial program
• executed at runtime

Directives

• added after compile time
• used at runtime

... ... ...

Example: π calculation

12

$$\int_0^1 \frac{4}{1+x^2}\,dx = \pi$$

$$\sum_{i=1}^{N} \frac{4}{1+x_i^2}\,\Delta \cong \pi$$

$$x_i = (i - 0.5)\,\Delta, \qquad \Delta = 1/N$$

#define NSTEPS 2000

int main(){

double pi, sum, step, x;long int i;

sum = 0;step = 1.0 / (double)NSTEPS;for (i=1; i<=NSTEPS; i++)

{x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);

}pi = sum * step;

printf("Pi is %4.12f\n", pi);

return 0;}

OpenMP version of π calculation

13

#define NSTEPS 2000

int main(){


sum = 0;step = 1.0 / (double)NSTEPS;

#pragma omp parallel#pragma omp for private(i,x) reduction(+:sum)

for (i=1; i<=NSTEPS; i++){

x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);

}pi = sum * step;


return 0;}

#define NSTEPS 2000

int main(){



#pragma omp parallel for private(i,x) reduction(+:sum)for (i=1; i<=NSTEPS; i++)

{x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);

}pi = sum * step;


return 0;}

Speedup vs number of threads

14

NSTEPS=20000000000

for construct -- private

15

#define NSTEPS 2000

int main(){double pi, sum, step, x;long int i;


#pragma omp parallel for private(i,x) reduction(+:sum)for (i=1; i<=NSTEPS; i++){x = step * ((double)i-0.5);sum += 4.0/(1.0+x*x);

}pi = sum * step;


return 0;}

doublex0;longint i0;

for(i0=1;i0<=1000;i0++){x0=step*((double)i0- 0.5);...

}

doublex1;longint i1;

for(i1=1001;i1<=2000;i1++){x1=step*((double)i1- 0.5);...}

Thread0 Thread1OMP_NUM_THREADS=2

for construct -- reduction

16

#define NSTEPS 2000




}pi = sum * step;


return 0;}


sum+=sum0+sum1;

doublex0;longint i0;doublesum0=0;for(i0=1;i0<=1000;i0++){x0=step*((double)i0- 0.5);sum0+=4.0/(1.0+x0*x0);

}

doublex1;longint i1;doublesum1=0;for(i1=1001;i1<=2000;i1++){x1=step*((double)i1- 0.5);sum1+=4.0/(1.0+x1*x1);}

for construct -- reduction

17

#define NSTEPS 2000




}pi = sum * step;


return 0;}


sum+=sum0+sum1;

doublex0;longint i0;doublesum0=0;for(i0=1;i0<=1000;i0++){x0=step*((double)i0- 0.5);sum0+=4.0/(1.0+x0*x0);

}

doublex1;longint i1;doublesum1=0;for(i1=1001;i1<=2000;i1++){x1=step*((double)i1- 0.5);sum1+=4.0/(1.0+x1*x1);}

Static loop iteration assignment

18

#pragma omp parallel for for (i = 0; i < 6; i++)


i =0,1,2 i =3,4,5


i =0,1 i =2,3

Thread2

i =4,5

Thread0 Thread1OMP_NUM_THREADS=4 Thread2 Thread3

Thread0 Thread1OMP_NUM_THREADS=5 Thread2 Thread3 Thread4

OMP_NUM_THREADS=6 ......


19

#pragma omp parallel for for (i = 0; i < 6; i++)


i =0,1,2 i =3,4,5


i =0,1 i =2,3

Thread2

i =4,5


i =0,1 i =2,3

Thread2

i =4

Thread3

i =5


i =0,1 i =2

Thread2

i =3

Thread3

i =4

Thread4

i =5

OMP_NUM_THREADS=6 ......


20

#include <stdio.h>#include <omp.h>

int main (){

int i, N;N = 6;

#pragma omp parallel{

#pragma omp forfor (i = 0; i < 6; i++)

printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());

#pragma omp forfor (i = 0; i < 6; i++)

printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());}

return 0;}

Loop 1: iter 0, thread_id 0Loop 1: iter 1, thread_id 1Loop 1: iter 2, thread_id 2Loop 1: iter 3, thread_id 3Loop 1: iter 4, thread_id 4Loop 1: iter 5, thread_id 5Loop 2: iter 0, thread_id 0Loop 2: iter 1, thread_id 1Loop 2: iter 2, thread_id 2Loop 2: iter 3, thread_id 3Loop 2: iter 4, thread_id 4Loop 2: iter 5, thread_id 5


21

export OMP_NUM_THREADS=6


First Run    Second Run

Dynamic loop iteration assignment

22


int main (){

int i, N;N = 6;

#pragma omp parallel{

#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++)

printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());

#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++)

printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());}

return 0;}

exportOMP_SCHEDULE=“dynamic,1”

Dynamic loop iteration assignment

23


export OMP_NUM_THREADS=6


First Run    Second Run

Static schedule – fairness in terms of iterations

24

Thread0 Thread1 Thread2

1234

5678

9101112

Thread0 Thread1 Thread2

15678

9101112

2

3

4

Computing time

Dynamic schedule – fairness in terms of load

25

Thread0 Thread1 Thread2 Thread0 Thread1 Thread2

15678

9101112

2

3

4

1 2 3

Computing time



26


15678

9101112

2

3

4

1 2 3

45 6

Computing time



27


15678

9101112

2

3

4

1 2 3

45 67 8

Computing time



28


15678

9101112

2

3

4

1 2 3

45 67 89 10

Computing time



29


15678

9101112

2

3

4

1 2 3

45 67 89 10

11 12

Computing time



30


15678910

1112

2

3

4

1 2 3

45 67 89 10

11 12

Computing time

exportOMP_SCHEDULE=“dynamic,1” exportOMP_SCHEDULE=“dynamic,2”

When to use static schedule

31


int main (){int i, N;N = 6;

#pragma omp parallel{#pragma omp forfor (i = 0; i < 6; i++){printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());do something on a[i];......

}

#pragma omp forfor (i = 0; i < 6; i++)printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());do something else on a[i];......

}

return 0;}


When to use dynamic schedule

32


int main (){int i, N;N = 6;

#pragma omp parallel{#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++){printf("Loop 1: iter %3d, thread_id %3d\n", i, omp_get_thread_num());if (i == ...){}

}

#pragma omp for schedule(dynamic,1)for (i = 0; i < 6; i++)printf("Loop 2: iter %3d, thread_id %3d\n", i, omp_get_thread_num());if (i == ...){}

}

return 0;}

Avoid parallel regions in inner loops

33

for(i=0;i<n;i++)for(j=0;j<n;j++)#pragmaomp parallel#pragmaomp forfor(k=0;k<n;k++){......}

#pragmaomp parallelfor(i=0;i<n;i++)for(j=0;j<n;j++)#pragmaomp forfor(k=0;k<n;k++){......}

Parallel construct overheads are incurred $n^2$ times    Parallel construct overheads are minimized

Collapse nested loops

34

for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}

#pragmaomp parallelforcollapse(3)for(i=0;i<n;i++)for(j=0;j<n;j++)for(k=0;k<n;k++){......}

#pragmaomp parallelforfor(ijk=0;ijk<n*n*n;ijk++){......}

Equivalent to

Collapse nested loops

35



for(i=0;i<n;i++)x=i;for(j=0;j<n;j++)for(k=0;k<n;k++){......}


Equivalent to

Collapse nested loops

36



for(i=0;i<n;i++)x=i;for(j=0;j<n;j++)for(k=0;k<n;k++){......}


Equivalent to

for_nested.c(13): error: parallel loops with collapse must be perfectly nested
    x = i;
    ^

Novel organic-inorganic perovskite AMX3 solar cells

37

2009,3.8%

25%21%

CH3NH3PbI3

Organic Inorganic Perovskite

Perovskite indicates the structure of crystals like CaTiO3

Perovskite materials are generally represented as AMX3

A: Cation    M: Metal Cation    X: Anion

CH3NH3, HC(NH2)2

Pb, Sn

I, Cl, Br

http://facserv.utk.edu/sustainability/on-campus/energy/    Data from National Renewable Energy Laboratory

Perovskites under illumination

38

𝐸(𝑘)

𝑘

Convergence of dielectric function with respect to k-grid sampling

39

2x2x2 5x5x5

10x10x10 20x20x20

4.41 hours for

30x30x30

Five lines to get OpenMP parallelization of the loop over k points

40

useOMP_LIB.......

blockinteger :: ik, nbandtot, nbandvalreal (DP) :: xk(3), scissornbandtot = ctrl%get_nband_tot()nbandval = ctrl%get_nband_val()scissor = ctrl%get_scissor_shift()!$OMP PARALLEL PRIVATE(ik, xk, eigvec, eig, hk, dpl)allocate ( hk(num_wann, num_wann) )allocate ( eig(num_wann) )allocate ( eigvec(num_wann, num_wann) )allocate ( dpl(num_wann, num_wann, 3) )!$OMP DOdo ik=1,nkpt

xk = kg%get_xk(ik)if (OMP_GET_THREAD_NUM() == 0) then

write ( *, '(A,I15,A,I15)' ) "Working on ", ik, " / ", nkptend ifcall recipe_k(p_ham,p_dpl_x,p_dpl_y,p_dpl_z,xk,num_wann,hk,eigvec,eig,dpl)call dist_k(wg,delta,nbandtot,nbandval,eig,scissor,dpl,epsi_k(:,:,ik),epsr_k(:,:,ik))

end do!$OMP END DOdeallocate( hk, eig, eigvec, dpl )!$OMP END PARALLEL

end block

......

Timing for 30x30x30

41

Threads Time(seconds) Time(minutes) Time(hours)

1 15893.38 264.89 4.41

2 7963.32 132.72 2.21

4 4106.97 68.45 1.14

8 2171.96 36.20 0.60

16 1106.89 18.45 0.31

Demonstrate Amdahl's law by varying the number of k points

42

MasterThread

MasterThread

Thread

Thread

MasterThread

Loop over k points    Load the input data    Save the output data

Demonstrate Amdahl's law by varying the number of k points

43

Summary

44

• OpenMP syntax and directives
  – Hello world example
  – π calculation
  – Many loops or nested loops

• Demonstration with real research example
  – Five lines to parallelize the code
  – Amdahl's law

Online resources

45

❖ OpenMP: http://www.openmp.org/

❖ Lawrence Livermore National Laboratory: https://computing.llnl.gov/tutorials/openMP

❖ Dartmouth Research Computing: https://www.dartmouth.edu/~rc/classes/intro_openmp

❖ UTexas Computing Center: http://pages.tacc.utexas.edu/~eijkhout/pcse/html/index.html

openmp101 · 2018. 3. 15. · two ways to specify the number of threads 10 omp_set_num_threads(4);...

Documents