report s2 - 東京大学nkl.cc.u-tokyo.ac.jp/14w/cw-mpi/s2-ref.pdf · report s2 kengo nakajima...

Report S2

Kengo Nakajima

Technical & Scientific Computing II (4820-1028)
Seminar on Computer Science II (4810-1205)

Parallel FEM

S2-

ref

2

• Overview
• Distributed Local Data
• Program
• Results

S2-

ref

3

1D Static Linear Elastic Problem

x=0 (xmin)    x= xmax    F

• Only deforms in x-direction (displacement: u)
  – Uniform: Sectional Area A, Young's Modulus E
  – Boundary Conditions (B.C.)
    • x=0 : u=0 (fixed)
    • x=xmax : F (axial force)
• Truss: NO bending deformation by G-force

S2-

ref

4

Copy and Compile

Copy

>$ cd <$O-fem2>

>$ cp /home/z30088/fem2/C/s2r.tar .

>$ tar xvf s2r.tar

<$O-fem2>/mpi/S2-ref is created

<$O-fem2>/mpi/S2-ref is called as <$O-S2r>

Compile

>$ cd <$O-S2r>

>$ mpifccpx -Kfast 1d.c

S2-

ref

5

Control File: input.dat

Control Data input.dat

4                  NE (Number of Elements) (=40)

1.0 1.0 1.0 1.0    Δx (Length of Each Elem.: L), F, A, E

100                Number of MAX. Iterations for CG Solver

1.e-8              Convergence Criteria for CG Solver

12

34

51

23

4

Elem

ent I

DN

ode

ID (G

loba

l)

x=1

x=0

x=1

x=2

x=3

x=4

1111

11

$E\,\dfrac{du}{dx} = \dfrac{F}{A}$

Strain = 1 = 100%

S2-

ref

6

go.s

h#!/bin/sh

#PJM -L “node=4“

Node # (.le.12)

#PJM -L “elapse=00:10:00“

Comp.Time(.le.15min)

#PJM -L “rscgrp=lecture5“

“Queue” (or lecture)

#PJM -g “gt85“

“Wallet”

#PJM -

#PJM -o “test.lst“

Standard Output

#PJM --mpi“proc=64“

MPIProcess # (.le.192)

mpiexec./a.out

Process#=8

“node=1“

“proc=8”

Process#=16

“node=1“

“proc=16”

Process#=32

“node=2“

“proc=32”

Process#=64

“node=4“

“proc=64”

Process#=192

“node=12“

“proc=192”

S2-

ref

7

Procedures for Parallel FEM

• Reading control file, entire element number etc.
• Creating "distributed local data" in the program
• Assembling local and global matrices for linear solvers
• Solving linear equations by CG
• Not so different from those of original code

S2-

ref

8

•O

verv

iew

•D

istr

ibut

ed L

ocal

Dat

a•

Pro

gram

•R

esul

ts

1D-P

art1

9

Finite Element Procedures

• Initialization
  – Control Data
  – Node, Connectivity of Elements (N: Node#, NE: Elem#)
  – Initialization of Arrays (Global/Element Matrices)
  – Element-Global Matrix Mapping (Index, Item)
• Generation of Matrix
  – Element-by-Element Operations (do icel= 1, NE)
    • Element matrices
    • Accumulation to global matrix
  – Boundary Conditions
• Linear Solver
  – Conjugate Gradient Method
• Calculation of Stress

S2-

ref

1010

Quad

rila

tera

l Ele

ments

1

5 1

2

6 2

3

7 3

8 4

1

5 1

6 2

3

7 3

8 4

Info

rmat

ion

is n

ot s

uffic

ient

for

asse

mbl

ing

coef

ficie

nt m

atric

es.

Nod

e-ba

sed

parti

tioni

ngIn

depe

nden

t var

iabl

es a

re

defin

ed a

t nod

es (v

ertic

es).

21

5 1

6 2

3

8 4

7 3

2

6 2

7 3

Info

. of n

odes

and

ele

men

ts in

ov

erla

pped

zon

es a

re re

quire

d fo

r ass

embl

ing

coef

. mat

rices

.C

ompu

tatio

n of

stre

ss

com

pone

nts

need

s sa

me

info

.

S2-

ref

1111

Distributed Local Data Structure for Parallel FEM

• Node-based partitioning
• Local data includes:
  – Nodes originally assigned to the domain/PE/partition
  – Elements which include above nodes
  – Nodes which are included in above elements, and originally NOT-assigned to the domain/PE/partition
• 3 categories for nodes
  – Internal nodes: Nodes originally assigned to the domain/PE/partition
  – External nodes: Nodes originally NOT-assigned to the domain/PE/partition
  – Boundary nodes: External nodes of other domains/PE's/partitions
• Communication tables
• Global info. is not needed except relationship between domains
  – Property of FEM: local element-by-element operations

S2-

ref

1212

Nod

e-ba

sed

Part

ition

ing

inte

rnal

nod

es -

elem

ents

-ex

tern

al n

odes

12

3

45

67

89

11

10

14

13

15

12

PE#0

78

910

45

612

311

12

PE#1

71

23

10

911

12

56

84 PE#2

34

8

69

10

12

12

5

11

7

PE#3

１２

３４

5

21

22

23

24

25

16

17

18

20

11

12

13

14

15

67

89

10

19PE#0

PE#1

PE#2

PE#3

S2-

ref

1313

E

lem

ents

whi

ch in

clud

e In

tern

al N

odes

Nod

e-ba

sed

Part

ition

ing

inte

rnal

nod

es -

elem

ents

-ex

tern

al n

odes

89

11

10141315

12

E

xter

nal N

odes

incl

uded

in th

e E

lem

ents

in o

verla

pped

regi

on a

mon

g pa

rtitio

ns.

P

artit

ione

d no

des

them

selv

es (I

nter

nal N

odes

)

12

3

45

67

In

fo o

f Ext

erna

l Nod

es a

re re

quire

d fo

r com

plet

ely

loca

l el

emen

t–ba

sed

oper

atio

ns o

n ea

ch p

roce

ssor

.

S2-

ref

14

1D F

EM: 1

2 no

des/

11 e

lem

’s/3

dom

ains

01

23

45

67

89

1011

01

23

45

67

89

10

01

23

4

78

910

11

12

3

78

910

0

34

56

78

34

56

7

01

23

45

67

89

1011

12

34

56

78

910

0

S2-

ref

15

1D F

EM: 1

2 no

des/

11 e

lem

’s/3

dom

ains

Loca

l ID

: Sta

rting

from

0 fo

r nod

e an

d el

em a

t eac

h do

mai

n

01

23

4

40

12

3

12

3

30

12

0

40

12

35

30

12

4

#0

#1

#2

S2-

ref

16

1D F

EM: 1

2 no

des/

11 e

lem

’s/3

dom

ains

Inte

rnal

/Ext

erna

l Nod

es

01

23

4

40

12

3

12

3

30

12

0

40

12

35

30

12

4

#0

#1

#2

S2-

ref

17

1D F

EM: N

umbe

ring

of L

ocal

ID

01

…N

-1N

N0

1…

N-1

1N

-2N

-1

N-1

01

N-2

0

N0

1…

N-1

N+1

N-1

01

N-2

N

#0:

N+1

node

sN

elem

ents

Oth

ers

(Gen

eral

):N

+2no

des

N+1

ele

men

ts

#PE

Tot-1

: N

+1 n

odes

Nel

emen

ts

S2-

ref

18

1D F

EM: 1

2 no

des/

11 e

lem

’s/3

dom

ains

Inte

grat

ion

on e

ach

elem

ent,

elem

ent m

atrix

-> g

loba

l mat

rixO

pera

tions

can

be

done

by

info

. of i

nter

nal/e

xter

nal n

odes

an

d el

emen

ts w

hich

incl

ude

thes

e no

des

01

23

4

40

12

3

12

3

30

12

0

40

12

35

30

12

4

#0

#1

#2

S2-

ref

19

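Preconditioned Conjugate Gradient Method (CG)

Compute $r^{(0)} = b - [A]x^{(0)}$
for $i = 1, 2, \dots$
    solve $[M]z^{(i-1)} = r^{(i-1)}$
    $\rho_{i-1} = r^{(i-1)} \cdot z^{(i-1)}$
    if $i = 1$
        $p^{(1)} = z^{(0)}$
    else
        $\beta_{i-1} = \rho_{i-1} / \rho_{i-2}$
        $p^{(i)} = z^{(i-1)} + \beta_{i-1}\,p^{(i-1)}$
    endif
    $q^{(i)} = [A]p^{(i)}$
    $\alpha_i = \rho_{i-1} / \left(p^{(i)} \cdot q^{(i)}\right)$
    $x^{(i)} = x^{(i-1)} + \alpha_i\,p^{(i)}$
    $r^{(i)} = r^{(i-1)} - \alpha_i\,q^{(i)}$
    check convergence $|r|$
end

Preconditioning: Diagonal Scaling (or Point Jacobi)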

S2-

ref

20

Preconditioning, DAXPY

Local Operations by Only Internal Points: Parallel Processing is possible

/*

//--{x}= {x} + ALPHA*{p}

// {r}= {r} -ALPHA*{q}

*/for(i=0;i<N;i++){

U[i] += Alpha * W[P][i];

W[R][i] -= Alpha * W[Q][i];

}

/*//--

{z}= [Minv]{r}

*/

for(i=0;i<N;i++){

W[Z][i] = W[DD][i] * W[R][i];

}

0 1 2 3 4 5 6 7 8 9 10 11

S2-

ref

21

Dot

Pro

duct

sG

loba

l Sum

mat

ion

need

ed: C

omm

unic

atio

n ?

/*

//--

ALPHA= RHO / {p}{q}

*/C1 = 0.0;

for(i=0;i<N;i++){

C1 += W[P][i] * W[Q][i];

} Alpha = Rho / C1;

0 1 2 3 4 5 6 7 8 9 10 11

S2-

ref

22

Mat

rix-V

ecto

r Pro

duct

sV

alue

s at

Ext

erna

l Poi

nts:

P-to

-P C

omm

unic

atio

n

/*

//--

{q}= [A]{p}

*/for(i=0;i<N;i++){

W[Q][i] = Diag[i] * W[P][i];

for(j=Index[i];j<Index[i+1];j++){

W[Q][i] += AMat[j]*W[P][Item[j]];

}}

40

12

35

S2-

ref

23

1D F

EM: 1

2 no

des/

11 e

lem

’s/3

dom

ains

Inte

rnal

/Ext

erna

l Nod

es

01

23

4

40

12

3

12

3

30

12

0

40

12

35

30

12

4

#0

#1

#2

S2-

ref

24

Mat

-Vec

Pro

duct

s: L

ocal

Op.

Pos

sibl

e0

1

2

3

4

5

6

7

8

9

10

11

0 1 2 3 4 5 6 7 8 9 10 11

0 1 2 3 4 5 6 7 8 9 10 11

=

S2-

ref

25

Mat

-Vec

Pro

duct

s: L

ocal

Op.

Pos

sibl

e0

1

2

3

4

5

6

7

8

9

10

11

0 1 2 3 4 5 6 7 8 9 10 11

0 1 2 3 4 5 6 7 8 9 10 11

=

S2-

ref

26

Mat

-Vec

Pro

duct

s: L

ocal

Op.

Pos

sibl

e0

1

2

3

0

1

2

3

0

1

2

3

0 1 2 3 0 1 2 3 0 1 2 3

0 1 2 3 0 1 2 3 0 1 2 3

=

S2-

ref

27

Mat

-Vec

Pro

duct

s: L

ocal

Op.

#1

0

1

2

3

0 1 2 3

0 1 2 3

=

40

12

35

0

1

2

3

0 1 2 3

0 1 2 3

=

4 5

S2-

ref

28

SEN

D: M

PI_I

send

/Irec

v/W

aita

llne

ib#0

Send

Buf

neib

#1ne

ib#2

neib

#3

BU

Flen

gth_

eB

UFl

engt

h_e

BU

Flen

gth_

eB

UFl

engt

h_e

expo

rt_i

ndex

[0]

expo

rt_i

ndex

[1]

expo

rt_i

ndex

[2]

expo

rt_i

ndex

[3]

expo

rt_i

ndex

[4]

for (neib=0; neib<NeibPETot;neib++){

for (k=export_index[neib];k<export_index[neib+1];k++){

kk= export_item[k];

SendBuf[k]= VAL[kk];

}} for (neib=0; neib<NeibPETot; neib++){

tag= 0;

iS_e= export_index[neib];

iE_e= export_index[neib+1];

BUFlength_e= iE_e -iS_e

ierr= MPI_Isend

(&SendBuf[iS_e], BUFlength_e, MPI_DOUBLE, NeibPE[neib],0,

MPI_COMM_WORLD, &ReqSend[neib])

} MPI_Waitall(NeibPETot, ReqSend, StatSend);

expo

rt_ite

m (e

xpor

t_in

dex[

neib

]:exp

ort_

inde

x[ne

ib+1

]-1)

are

sent

to n

eib-

th n

eigh

bor

Cop

ied

to s

endi

ng b

uffe

rs

S2-

ref

29

REC

V: M

PI_I

send

/Irec

v/W

aita

ll

neib

#0R

ecvB

ufne

ib#1

neib

#2ne

ib#3

BU

Flen

gth_

iB

UFl

engt

h_i

BU

Flen

gth_

iB

UFl

engt

h_i

for (neib=0; neib<NeibPETot; neib++){

tag= 0;

iS_i= import_index[neib];

iE_i= import_index[neib+1];

BUFlength_i= iE_i -iS_i

ierr= MPI_Irecv

(&RecvBuf[iS_i], BUFlength_i, MPI_DOUBLE, NeibPE[neib],0,

MPI_COMM_WORLD, &ReqRecv[neib])

} MPI_Waitall(NeibPETot, ReqRecv, StatRecv);


for (k=import_index[neib];k<import_index[neib+1];k++){

kk= import_item[k];

VAL[kk]= RecvBuf[k];

}} im

port

_ind

ex[0

]im

port

_ind

ex[1

]im

port

_ind

ex[2

]im

port

_ind

ex[3

]im

port

_ind

ex[4

]

impo

rt_ite

m(im

port_

inde

x[ne

ib]:i

mpo

rt_in

dex[

neib

+1]-1

) ar

e re

ceiv

ed fr

om n

eib-

thne

ighb

or

Cop

ied

from

rece

ivin

g bu

ffer

S2-

ref

30

•O

verv

iew

•D

istri

bute

d Lo

cal D

ata

•Pr

ogra

m•

Res

ults

S2-

ref

31

Pro

gram

: 1d.

c (1

/11)

Var

iabl

es#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <assert.h>

#include <mpi.h>

int main(int argc, char **argv){

int NE, N, NP, NPLU, IterMax, NEg, Ng, errno;

double dX, Resid, Eps, Area, F, Young;

double X1, X2, U1, U2, DL, Strain, Sigma, Ck;

double *U, *Rhs, *X, *Diag, *AMat;

double *R, *Z, *Q, *P, *DD;

int *Index, *Item, *Icelnod;

double Kmat[2][2], Emat[2][2];

int i, j, in1, in2, k, icel, k1, k2, jS;

int iter, nr, neib;

FILE *fp;

double BNorm2, Rho, Rho1=0.0, C1, Alpha, Beta, DNorm2;

int PETot, MyRank, kk, is, ir, len_s, len_r, tag;

int NeibPETot, BufLength, NeibPE[2];

int import_index[3], import_item[2];

int export_index[3], export_item[2];

double SendBuf[2], RecvBuf[2];

double BNorm20, Rho0, C10, DNorm20;

double StartTime, EndTime;

int ierr = 1;

MPI_Status *StatSend, *StatRecv;

MPI_Request *RequestSend, *RequestRecv;Es

sent

ial f

or p

rogr

ams

with

MPI

S2-

ref

32

Pro

gram

: 1d.

c (2

/11)

Con

trol D

ata

/*// +-------+

// | INIT. |

// +-------+

//=== */

/*//--

CONTROL data

*/

ierr = MPI_Init(&argc, &argv);

Initialization

ierr = MPI_Comm_size(MPI_COMM_WORLD, &PETot);

Entire Process #: PETot

ierr = MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);

Rank ID (0-PETot-1): MyRank

if(MyRank == 0){

fp = fopen("input.dat", "r");

assert(fp != NULL);

fscanf(fp, "%d", &NEg);

fscanf(fp, "%lf %lf %lf %lf", &dX, &F, &Area, &Young);

fscanf(fp, "%d", &IterMax);

fscanf(fp, "%lf", &Eps);

fclose(fp);

} ierr = MPI_Bcast(&NEg , 1, MPI_INT, 0, MPI_COMM_WORLD);

ierr = MPI_Bcast(&IterMax, 1, MPI_INT, 0, MPI_COMM_WORLD);

ierr = MPI_Bcast(&dX , 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

ierr = MPI_Bcast(&F , 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

ierr = MPI_Bcast(&Area , 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

ierr = MPI_Bcast(&Young , 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

ierr = MPI_Bcast(&Eps , 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

S2-

ref

33

Pro

gram

: 1d.

c (2

/11)

Con

trol D

ata

/*// +-------+

// | INIT. |

// +-------+

//=== */

/*//--

CONTROL data

*/


Initialization





if(MyRank == 0){

Reading control file if MyRank=0


assert(fp != NULL);

fscanf(fp, "%d", &NEg);




fclose(fp);


ierr = MPI_Bcast(&IterMax, 1, MPI_INT, 0, MPI_COMM_WORLD);






S2-

ref

34

Pro

gram

: 1d.

c (2

/11)

Con

trol D

ata

/*// +-------+

// | INIT. |

// +-------+

//=== */

/*//--

CONTROL data

*/


Initialization





if(MyRank == 0){

Reading control file if MyRank=0


assert(fp != NULL);

fscanf(fp, “%d”, &NEg);

NEg: Total Number of Elements (Global)




fclose(fp);


Parameters are sent to each process

ierr = MPI_Bcast(&IterMax, 1, MPI_INT, 0, MPI_COMM_WORLD); from Process #0.






S2-

ref

3535

MPI_Bcast

• Broadcasts a message from the process with rank "root" to all other processes of the communicator

• MPI_Bcast (buffer,count,datatype,root,comm)
  – buffer    choice        I/O  starting address of buffer; type is defined by "datatype"
  – count     int           I    number of elements in send/receive buffer
  – datatype  MPI_Datatype  I    data type of elements of send/receive buffer
        FORTRAN MPI_INTEGER, MPI_REAL, MPI_DOUBLE_PRECISION, MPI_CHARACTER etc.
        C MPI_INT, MPI_FLOAT, MPI_DOUBLE, MPI_CHAR etc.
  – root      int           I    rank of root process
  – comm      MPI_Comm      I    communicator

A0

P#0

B0

C0

D0

P#1

P#2

P#3

A0

P#0

B0

C0

D0

P#1

P#2

P#3

Bro

adca

stA

0P

#0B

0C

0D

0

A0

P#1

B0

C0

D0

A0

P#2

B0

C0

D0

A0

P#3

B0

C0

D0

A0

P#0

B0

C0

D0

A0

P#1

B0

C0

D0

A0

P#2

B0

C0

D0

A0

P#3

B0

C0

D0

S2-

ref

36

Pro

gram

: 1d.

c (3

/11)

Dis

tribu

ted

Loca

l Mes

h/*//--LOCAL MESH size

*/

Ng= NEg + 1;

Ng: Number of Nodes (Global)

N = Ng / PETot;

N : Number of Nodes (Local)

nr = Ng -

N*PETot;

mod(Ng,PETot) .ne. 0

if(MyRank < nr) N++;

NE= N -1 + 2;

NP= N + 2;

if(MyRank == 0) NE= N -

1 + 1;

if(MyRank == 0) NP= N + 1;

if(MyRank == PETot-1) NE= N -1 + 1;

if(MyRank == PETot-1) NP= N + 1;

if(PETot==1){NE=N-1;}

if(PETot==1){NP=N ;}

/*/--Arrays

*/

U = calloc(NP, sizeof(double));

Diag = calloc(NP, sizeof(double));

AMat = calloc(2*NP-2, sizeof(double));

Rhs = calloc(NP, sizeof(double));

Index= calloc(NP+1, sizeof(int));

Item = calloc(2*NP-2, sizeof(int));

Icelnod= calloc(2*NE, sizeof(int));

S2-

ref

37

Pro

gram

: 1d.

c (3

/11)

Dis

tribu

ted

Loca

l Mes

h, U

nifo

rm E

lem

ents

/*//--LOCAL MESH size

*/

Ng= NEg + 1;


N = Ng / PETot;


nr = Ng -

N*PETot;



NE= N -1 + 2;

Number of Elements (Local)

NP= N + 2;

Total Number of Nodes (Local) (Internal + External Nodes)


1 + 1;






/*/--Arrays

*/








N0

1…

N-1

N+1

N-1

01

N-2

N

Oth

ers

(Gen

eral

):N

+2no

des

N+1

ele

men

ts

S2-

ref

38

Pro

gram

: 1d.

c (3

/11)

Dis

tribu

ted

Loca

l Mes

h, U

nifo

rm E

lem

ents


*/

Ng= NEg + 1;


N = Ng / PETot;


nr = Ng -

N*PETot;



NE= N -

1 + 2;

NP= N + 2;


1 + 1;






/*/--Arrays

*/








01

…N

-1N

1N

-2N

-10

#0:

N+1

node

sN

elem

ents

S2-

ref

39

Pro

gram

: 1d.

c (3

/11)

Dis

tribu

ted

Loca

l Mes

h, U

nifo

rm E

lem

ents


*/

Ng= NEg + 1;


N = Ng / PETot;


nr = Ng -

N*PETot;



NE= N -

1 + 2;

NP= N + 2;


1 + 1;






/*/--Arrays

*/








N0

1…

N-1

N-1

01

N-2

#PET

ot-1

: N

+1 n

odes

Nel

emen

ts

S2-

ref

40

Pro

gram

: 1d.

c (3

/11)

Dis

tribu

ted

Loca

l Mes

h, U

nifo

rm E

lem

ents


*/

Ng= NEg + 1;


N = Ng / PETot;


nr = Ng -

N*PETot;



NE= N -

1 + 2;

NP= N + 2;


1 + 1;






/*/--Arrays

*/


Size of arrays is “NP”, not “N”







S2-

ref

41

Pro

gram

: 1d.

c (4

/11)

Initi

aliz

atio

n of

Arr

ays,

Ele

men

ts-N

odes

for(i=0;i<NP;i++) U[i] = 0.0;

for(i=0;i<NP;i++) Diag[i] = 0.0;

for(i=0;i<NP;i++) Rhs[i] = 0.0;

for(k=0;k<2*NP-2;k++) AMat[k] = 0.0;

for(i=0;i<3;i++) import_index[i]= 0;

for(i=0;i<3;i++) export_index[i]= 0;

for(i=0;i<2;i++) import_item[i]= 0;

for(i=0;i<2;i++) export_item[i]= 0;

for(icel=0;icel<NE;icel++){

Icelnod[2*icel ]= icel;

Icelnod[2*icel+1]= icel+1;

} if(PETot>1){

if(MyRank==0){

icel= NE-1;

Icelnod[2*icel ]= N-1;

Icelnod[2*icel+1]= N;

}else if(MyRank==PETot-1){

icel= NE-1;

Icelnod[2*icel ]= N;

Icelnod[2*icel+1]= 0;

}else{

icel= NE-2;



icel= NE-1;


Icelnod[2*icel+1]= N+1;

}}

icel

Icelnod[2*icel]

=icel

Icelnod[2*icel+1]

=icel+1

S2-

ref

42

Pro

gram

: 1d.

c (4

/11)

Initi

aliz

atio

n of

Arr

ays,

Ele

men

ts-N

odes

for(i=0;i<NP;i++) U[i] = 0.0;

for(i=0;i<NP;i++) Diag[i] = 0.0;

for(i=0;i<NP;i++) Rhs[i] = 0.0;

for(k=0;k<2*NP-2;k++) AMat[k] = 0.0;

for(i=0;i<3;i++) import_index[i]= 0;

for(i=0;i<3;i++) export_index[i]= 0;

for(i=0;i<2;i++) import_item[i]= 0;

for(i=0;i<2;i++) export_item[i]= 0;


Icelnod[2*icel ]= icel;

Icelnod[2*icel+1]= icel+1;

} if(PETot>1){

if(MyRank==0){

icel= NE-1;


Icelnod[2*icel+1]= N;

}else if(MyRank==PETot-1){

icel= NE-1;



}else{

icel= NE-2;



icel= NE-1;


Icelnod[2*icel+1]= N+1;

}}

e.g.

Ele

men

t-0 in

clud

es n

ode-

0 an

d no

de-1

01

…N

-1N

N0

1…

N-1

1N

-2N

-1

N-1

01

N-2

0

N0

1…

N-1

N+1

N-1

01

N-2

N

#0:

N+1

node

sN

elem

ents

Oth

ers

(Gen

eral

):N

+2no

des

N+1

ele

men

ts

#PET

ot-1

: N

+1 n

odes

Nel

emen

ts

S2-

ref

43

Pro

gram

: 1d.

c (5

/11)

”Inde

x”

Kmat[0][0]= +1.0;

Kmat[0][1]= -1.0;

Kmat[1][0]= -1.0;

Kmat[1][1]= +1.0;

/*// +--------------+

// | CONNECTIVITY |

// +--------------+

*/

for(i=0;i<N+1;i++) Index[i] = 2;

for(i=N+1;i<NP+1;i++) Index[i] = 1;

Index[0] = 0;

if(MyRank == 0) Index[1] = 1;

if(MyRank == PETot-1) Index[N] = 1;

for(i=0;i<NP;i++){

Index[i+1]= Index[i+1] + Index[i];

} NPLU= Index[NP];

01

…N

-1N

N0

1…

N-1

1N

-2N

-1

N-1

01

N-2

0

N0

1…

N-1

N+1

N-1

01

N-2

N

#0:

N+1

node

sN

elem

ents

Oth

ers

(Gen

eral

):N

+2no

des

N+1

ele

men

ts

#PET

ot-1

: N

+1 n

odes

Nel

emen

ts

S2-

ref

44

Pro

gram

: 1d.

c (6

/11)

”Item

” for(i=0;i<N;i++){

jS = Index[i];

if((MyRank==0)&&(i==0)){

Item[jS] = i+1;

}else if((MyRank==PETot-1)&&(i==N-1)){

Item[jS] = i-1;

}else{

Item[jS] = i-1;

Item[jS+1] = i+1;

if(i==0) { Item[jS] = N;}

if(i==N-1){ Item[jS+1]= N+1;}

if((MyRank==0)&&(i==N-1)){Item[jS+1]= N;}

}} i =N;

jS= Index[i];

if (MyRank==0) {

Item[jS]= N-1;

}else {

Item[jS]= 0;

}

i =N+1;

jS= Index[i];

if ((MyRank!=0)&&(MyRank!=PETot-1)) {

Item[jS]= N-1;

}

01

…N

-1N

N0

1…

N-1

1N

-2N

-1

N-1

01

N-2

0

N0

1…

N-1

N+1

N-1

01

N-2

N

#0:

N+1

node

sN

elem

ents

Oth

ers

(Gen

eral

):N

+2no

des

N+1

ele

men

ts

#PET

ot-1

: N

+1 n

odes

Nel

emen

ts

S2-

ref

45

Pro

gram

: 1d.

c (7

/11)

Com

mun

icat

ion

Tabl

es/*

//--

COMMUNICATION

*/

NeibPETot = 2;

if(MyRank == 0) NeibPETot = 1;

if(MyRank == PETot-1) NeibPETot = 1;

if(PETot == 1) NeibPETot = 0;

NeibPE[0] = MyRank -1;

NeibPE[1] = MyRank + 1;

if(MyRank == 0)

NeibPE[0] = MyRank + 1;

if(MyRank == PETot-1) NeibPE[0] = MyRank -1;

import_index[1]=1;

import_index[2]=2;

import_item[0]= N;

import_item[1]= N+1;

export_index[1]=1;

export_index[2]=2;

export_item[0]= 0;

export_item[1]= N-1;

if(MyRank == 0) import_item[0]=N;

if(MyRank == 0) export_item[0]=N-1;

BufLength = 1;

StatSend = malloc(sizeof(MPI_Status) * NeibPETot);

StatRecv = malloc(sizeof(MPI_Status) * NeibPETot);

RequestSend = malloc(sizeof(MPI_Request) * NeibPETot);

RequestRecv = malloc(sizeof(MPI_Request) * NeibPETot);

01

…N

-1N

N0

1…

N-1

1N

-2N

-1

N-1

01

N-2

0

N0

1…

N-1

N+1

N-1

01

N-2

N

#0:

N+1

node

sN

elem

ents

Oth

ers

(Gen

eral

):N

+2no

des

N+1

ele

men

ts

#PET

ot-1

: N

+1 n

odes

Nel

emen

ts

MPI_Isend

• Begins a non-blocking send
  – Send the contents of sending buffer (starting from sendbuf, number of messages: count) to dest with tag.
  – Contents of sending buffer cannot be modified before calling corresponding MPI_Waitall.

• MPI_Isend (sendbuf,count,datatype,dest,tag,comm,request)
  – sendbuf   choice        I  starting address of sending buffer
  – count     int           I  number of elements in sending buffer
  – datatype  MPI_Datatype  I  datatype of each sending buffer element
  – dest      int           I  rank of destination
  – tag       int           I  message tag
        This integer can be used by the application to distinguish messages. Communication occurs if tag's of MPI_Isend and MPI_Irecv are matched. Usually tag is set to be "0" (in this class),
  – comm      MPI_Comm      I  communicator
  – request   MPI_Request   O  communication request array used in MPI_Waitall

46S

2-re

f

MPI_Irecv

• Begins a non-blocking receive
  – Receiving the contents of receiving buffer (starting from recvbuf, number of messages: count) from source with tag.
  – Contents of receiving buffer cannot be used before calling corresponding MPI_Waitall.

• MPI_Irecv (recvbuf,count,datatype,source,tag,comm,request)
  – recvbuf   choice        I  starting address of receiving buffer
  – count     int           I  number of elements in receiving buffer
  – datatype  MPI_Datatype  I  datatype of each receiving buffer element
  – source    int           I  rank of source
  – tag       int           I  message tag
        This integer can be used by the application to distinguish messages. Communication occurs if tag's of MPI_Isend and MPI_Irecv are matched. Usually tag is set to be "0" (in this class),
  – comm      MPI_Comm      I  communicator
  – request   MPI_Request   O  communication request array used in MPI_Waitall

47S

2-re

f

MPI_Waitall

• MPI_Waitall blocks until all comm's, associated with request in the array, complete. It is used for synchronizing MPI_Isend and MPI_Irecv in this class.
• At sending phase, contents of sending buffer cannot be modified before calling corresponding MPI_Waitall. At receiving phase, contents of receiving buffer cannot be used before calling corresponding MPI_Waitall.
• MPI_Isend and MPI_Irecv can be synchronized simultaneously with a single MPI_Waitall if it is consistent.
  – Same request should be used in MPI_Isend and MPI_Irecv.
• Its operation is similar to that of MPI_Barrier but, MPI_Waitall can not be replaced by MPI_Barrier.
  – Possible troubles using MPI_Barrier instead of MPI_Waitall: Contents of request and status are not updated properly, very slow operations etc.

• MPI_Waitall (count,request,status)
  – count    int          I    number of processes to be synchronized
  – request  MPI_Request  I/O  comm. request used in MPI_Waitall (array size: count)
  – status   MPI_Status   O    array of status objects
        MPI_STATUS_SIZE: defined in 'mpif.h', 'mpi.h'

48S

2-re

f

Generalized Comm. Table: Send

• Neighbors
  – NeibPETot, NeibPE[neib]
• Message size for each neighbor
  – export_index[neib], neib= 0, NeibPETot
• ID of boundary points
  – export_item[k], k= 0, export_index[NeibPETot]-1
• Messages to each neighbor
  – SendBuf[k], k= 0, export_index[NeibPETot]-1

49S

2-re

f

S2-

ref

50

SEN

D: M

PI_I

send

/Irec

v/W

aita

llne

ib#0

Send

Buf

neib

#1ne

ib#2

neib

#3

BU

Flen

gth_

eB

UFl

engt

h_e

BU

Flen

gth_

eB

UFl

engt

h_e

expo

rt_i

ndex

[0]

expo

rt_i

ndex

[1]

expo

rt_i

ndex

[2]

expo

rt_i

ndex

[3]

expo

rt_i

ndex

[4]



kk= export_item[k];



tag= 0;




ierr= MPI_Isend

(&SendBuf[iS_e], BUFlength_e, MPI_DOUBLE, NeibPE[neib],0,



expo

rt_ite

m (e

xpor

t_in

dex[

neib

]:exp

ort_

inde

x[ne

ib+1

]-1)

are

sent

to n

eib-

th n

eigh

bor

Cop

ied

to s

endi

ng b

uffe

rs

SEND/Export: 1D Problem

• Neighbors
  – NeibPETot, NeibPE[neib]
    • NeibPETot=2, NeibPE[0]= my_rank-1, NeibPE[1]= my_rank+1
• Message size for each neighbor
  – export_index[neib], neib= 0, NeibPETot
    • export_index[0]=0, export_index[1]= 1, export_index[2]= 2
• ID of boundary points
  – export_item[k], k= 0, export_index[NeibPETot]-1
    • export_item[0]= 0, export_item[1]= N-1
• Messages to each neighbor
  – SendBuf[k], k= 0, export_index[NeibPETot]-1
    • SendBuf[0]= VAL[0], SendBuf[1]= VAL[N-1]

51S

2-re

f

40

12

35

SendBuf[0]=VAL[0]

SendBuf[1]=VAL[3]

Generalized Comm. Table: Receive

• Neighbors
  – NeibPETot, NeibPE[neib]
• Message size for each neighbor
  – import_index[neib], neib= 0, NeibPETot
• ID of external points
  – import_item[k], k= 0, import_index[NeibPETot]-1
• Messages from each neighbor
  – RecvBuf[k], k= 0, import_index[NeibPETot]-1

52S

2-re

f

S2-

ref

53

REC

V: M

PI_I

send

/Irec

v/W

aita

ll

neib

#0R

ecvB

ufne

ib#1

neib

#2ne

ib#3

BU

Flen

gth_

iB

UFl

engt

h_i

BU

Flen

gth_

iB

UFl

engt

h_i


tag= 0;




ierr= MPI_Irecv






kk= import_item[k];


}} im

port

_ind

ex[0

]im

port

_ind

ex[1

]im

port

_ind

ex[2

]im

port

_ind

ex[3

]im

port

_ind

ex[4

]

impo

rt_ite

m(im

port_

inde

x[ne

ib]:i

mpo

rt_in

dex[

neib

+1]-1

) ar

e re

ceiv

ed fr

om n

eib-

thne

ighb

or

Cop

ied

from

rece

ivin

g bu

ffer

RECV/Import: 1D Problem

• Neighbors
  – NeibPETot, NeibPE[neib]
    • NeibPETot=2, NeibPE[0]= my_rank-1, NeibPE[1]= my_rank+1
• Message size for each neighbor
  – import_index[neib], neib= 0, NeibPETot
    • import_index[0]=0, import_index[1]= 1, import_index[2]= 2
• ID of external points
  – import_item[k], k= 0, import_index[NeibPETot]-1
    • import_item[0]= N, import_item[1]= N+1
• Messages from each neighbor
  – RecvBuf[k], k= 0, import_index[NeibPETot]-1
    • VAL[N]=RecvBuf[0], VAL[N+1]=RecvBuf[1]

54S

2-re

f

40

12

35

VAL[4]=RecvBuf[0]

VAL[5]=RecvBuf[1]

S2-

ref

55

Pro

gram

: 1d.

c (8

/11)

Mat

rix A

ssem

blin

g, N

O c

hang

es fr

om 1

-CP

U c

ode

/*// +-----------------+

// | MATRIX assemble |

// +-----------------+

*/


in1= Icelnod[2*icel];

in2= Icelnod[2*icel+1];

DL = dX;

Ck= Area*Young/DL;

Emat[0][0]= Ck*Kmat[0][0];

Emat[0][1]= Ck*Kmat[0][1];

Emat[1][0]= Ck*Kmat[1][0];

Emat[1][1]= Ck*Kmat[1][1];

Diag[in1]= Diag[in1] + Emat[0][0];

Diag[in2]= Diag[in2] + Emat[1][1];

if ((MyRank==0)&&(icel==0)){

k1=Index[in1];

}else {k1=Index[in1]+1;}

k2=Index[in2];

AMat[k1]= AMat[k1] + Emat[0][1];

AMat[k2]= AMat[k2] + Emat[1][0];

}

01

23

4

40

12

3

12

3

30

12

0

40

12

35

30

12

4

#0

#1

#2

S2-

ref

56

Pro

gram

: 1d.

c (9

/11)

Bou

ndar

y C

ond.

, ALM

OS

T N

O c

hang

es fr

om 1

-CP

U c

ode

/*// +---------------------+

// | BOUNDARY conditions |

// +---------------------+

*/

/* X=Xmin */

if (MyRank==0){

i=0;

jS= Index[i];

AMat[jS]= 0.0;

Diag[i ]= 1.0;

Rhs [i ]= 0.0;

for(k=0;k<NPLU;k++){

if(Item[k]==0){AMat[k]=0.0;}

}}

/* X=Xmax */

if (MyRank==PETot-1){

i=N-1;

Rhs[i]= F;

}

01

23

4

40

12

3

12

3

30

12

0

40

12

35

30

12

4

#0

#1

#2

S2-

ref

57

Pro

gram

: 1d.

c（10

/11）

Con

juga

te G

radi

ent M

etho

d/*// +---------------+

// | CG iterations |

// +---------------+

//=== */

R = calloc(NP, sizeof(double));

Z = calloc(NP, sizeof(double));

P = calloc(NP, sizeof(double));

Q = calloc(NP, sizeof(double));

DD= calloc(NP, sizeof(double));

for(i=0;i<N;i++){

DD[i]= 1.0 / Diag[i];

}

/*//--

{r0}= {b} -[A]{xini} |

*/

for(neib=0;neib<NeibPETot;neib++){

for(k=export_index[neib];k<export_index[neib+1];k++){

kk= export_item[k];

SendBuf[k]= U[kk];

}}

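Compute $r^{(0)} = b - [A]x^{(0)}$
for $i = 1, 2, \dots$
    solve $[M]z^{(i-1)} = r^{(i-1)}$
    $\rho_{i-1} = r^{(i-1)} \cdot z^{(i-1)}$
    if $i = 1$
        $p^{(1)} = z^{(0)}$
    else
        $\beta_{i-1} = \rho_{i-1} / \rho_{i-2}$
        $p^{(i)} = z^{(i-1)} + \beta_{i-1}\,p^{(i-1)}$
    endif
    $q^{(i)} = [A]p^{(i)}$
    $\alpha_i = \rho_{i-1} / \left(p^{(i)} \cdot q^{(i)}\right)$
    $x^{(i)} = x^{(i-1)} + \alpha_i\,p^{(i)}$
    $r^{(i)} = r^{(i-1)} - \alpha_i\,q^{(i)}$
    check convergence $|r|$
end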

S2-

ref

58

Conjugate Gradient Method (CG)

• Matrix-Vector Multiply
• Dot Product
• Preconditioning: in the same way as 1CPU code
• DAXPY: in the same way as 1CPU code

S2-

ref

59

Prec

ondi

tioni

ng, D

AXP

Y/*

//--{z}= [Minv]{r}

*/

for(i=0;i<N;i++){

Z[i] = DD[i] * R[i];

}

/*//--{x}= {x} + ALPHA*{p}

// {r}= {r} -ALPHA*{q}

*/

for(i=0;i<N;i++){

U[i] += Alpha * P[i];

R[i] -= Alpha * Q[i];

}

S2-

ref

60

Mat

rix-V

ecto

r Mul

tiply

(1/2

)U

sing

Com

m. T

able

, {p}

is u

pdat

ed b

efor

e co

mpu

tatio

n /*//--

{q}= [A]{p}

*/


for(k=export_index[neib];k<export_index[neib+1];k++){

kk= export_item[k];

SendBuf[k]= P[kk];

}}


is = export_index[neib];

len_s= export_index[neib+1] -export_index[neib];

MPI_Isend(&SendBuf[is], len_s, MPI_DOUBLE, NeibPE[neib],

0, MPI_COMM_WORLD, &RequestSend[neib]);

} for(neib=0;neib<NeibPETot;neib++){

ir = import_index[neib];

len_r= import_index[neib+1] -import_index[neib];

MPI_Irecv(&RecvBuf[ir], len_r, MPI_DOUBLE, NeibPE[neib],

0, MPI_COMM_WORLD, &RequestRecv[neib]);

} MPI_Waitall(NeibPETot, RequestRecv, StatRecv);


for(k=import_index[neib];k<import_index[neib+1];k++){

kk= import_item[k];

P[kk]=RecvBuf[k];

}}

S2-

ref

61

Mat

rix-V

ecto

r Mul

tiply

(2/2

){q

}= [A

]{p}

MPI_Waitall(NeibPETot, RequestSend, StatSend);

for(i=0;i<N;i++){

Q[i] = Diag[i] * P[i];

for(j=Index[i];j<Index[i+1];j++){

Q[i] += AMat[j]*P[Item[j]];

}}

S2-

ref

62

Dot

Pro

duct

Glo

bal S

umm

atio

n by

MP

I_A

llred

uce

/*

//--RHO= {r}{z}

*/

Rho0= 0.0;

for(i=0;i<N;i++){

Rho0 += R[i] * Z[i];

} ierr = MPI_Allreduce(&Rho0, &Rho, 1, MPI_DOUBLE,

MPI_SUM, MPI_COMM_WORLD);

S2-ref 63

MPI_Reduce

• Reduces values on all processes to a single value
  – Summation, Product, Max, Min etc.

•MPI_Reduce(sendbuf,recvbuf,count,datatype,op,root,comm)

– sendbuf   choice        I   starting address of send buffer
– recvbuf   choice        O   starting address of receive buffer
                              type is defined by "datatype"
– count     int           I   number of elements in send/receive buffer
– datatype  MPI_Datatype  I   data type of elements of send/receive buffer

    FORTRAN  MPI_INTEGER, MPI_REAL, MPI_DOUBLE_PRECISION, MPI_CHARACTER etc.

    C        MPI_INT, MPI_FLOAT, MPI_DOUBLE, MPI_CHAR etc

– op        MPI_Op        I   reduce operation

    MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD, MPI_LAND, MPI_BAND etc

    Users can define operations by MPI_OP_CREATE

– root      int           I   rank of root process
– comm      MPI_Comm      I   communicator

Figure: Reduce

Before (send buffers on each process):

    P#0:  A0  B0  C0  D0
    P#1:  A1  B1  C1  D1
    P#2:  A2  B2  C2  D2
    P#3:  A3  B3  C3  D3

After Reduce (receive buffer on the root process P#0 only):

    P#0:  op.A0-A3  op.B0-B3  op.C0-C3  op.D0-D3
    P#1:  (unchanged)
    P#2:  (unchanged)
    P#3:  (unchanged)

S2-ref 64

Send/Receive Buffer (Sending/Receiving)

• Arrays of "send (sending) buffer" and "receive (receiving) buffer" often appear in MPI.

• Addresses of "send (sending) buffer" and "receive (receiving) buffer" must be different.

S2-ref 65

Example of MPI_Reduce (1/2)

call MPI_REDUCE

(sendbuf,recvbuf,count,datatype,op,root,comm,ierr)

real(kind=8):: X0, X1

call MPI_REDUCE

(X0, X1, 1, MPI_DOUBLE_PRECISION, MPI_MAX, 0, <comm>, ierr)

real(kind=8):: X0(4), XMAX(4)

call MPI_REDUCE

(X0, XMAX, 4, MPI_DOUBLE_PRECISION, MPI_MAX, 0, <comm>, ierr)

Global Max values of X0[i] go to XMAX[i] on #0 process (i=0~3)

S2-ref 66

Example of MPI_Reduce (2/2)

call MPI_REDUCE

(sendbuf,recvbuf,count,datatype,op,root,comm,ierr)

real(kind=8):: X0, XSUM

call MPI_REDUCE

(X0, XSUM, 1, MPI_DOUBLE_PRECISION, MPI_SUM, 0, <comm>, ierr)

real(kind=8):: X0(4)

call MPI_REDUCE

(X0(1), X0(3), 2, MPI_DOUBLE_PRECISION, MPI_SUM, 0, <comm>, ierr)

Global summation of X0 goes to XSUM on #0 process.

・Global summation of X0[0] goes to X0[2] on #0 process.
・Global summation of X0[1] goes to X0[3] on #0 process.

S2-ref 67

MPI_Allreduce

• MPI_Reduce + MPI_Bcast
• Summation (of dot products) and MAX/MIN values are likely to be utilized in each process

•call MPI_Allreduce

(sendbuf,recvbuf,count,datatype,op, comm)

– sendbuf   choice        I   starting address of send buffer
– recvbuf   choice        O   starting address of receive buffer
                              type is defined by "datatype"
– count     int           I   number of elements in send/receive buffer
– datatype  MPI_Datatype  I   data type of elements of send/receive buffer
– op        MPI_Op        I   reduce operation
– comm      MPI_Comm      I   communicator

Figure: Allreduce

Before (send buffers on each process):

    P#0:  A0  B0  C0  D0
    P#1:  A1  B1  C1  D1
    P#2:  A2  B2  C2  D2
    P#3:  A3  B3  C3  D3

After Allreduce (receive buffers on every process):

    P#0:  op.A0-A3  op.B0-B3  op.C0-C3  op.D0-D3
    P#1:  op.A0-A3  op.B0-B3  op.C0-C3  op.D0-D3
    P#2:  op.A0-A3  op.B0-B3  op.C0-C3  op.D0-D3
    P#3:  op.A0-A3  op.B0-B3  op.C0-C3  op.D0-D3

S2-ref 68

Program: 1d.c (11/11)

Output by Each Process

/*//--OUTPUT

*/

printf("¥n%s¥n", "### DISPLACEMENT");

for(i=0;i<N;i++){

printf("%3d%8d%16.6E¥n", MyRank, i+1, U[i]);

} ierr = MPI_Finalize();

return ierr;

}

S2-ref 69

Output List: Not organized

### DISPLACEMENT

0 1 0.000000E+00

0 2 1.000000E-01

0 3 2.000000E-01

0 4 3.000000E-01

0 5 4.000000E-01

0 6 5.000000E-01

0 7 6.000000E-01

0 8 7.000000E-01

0 9 8.000000E-01

0 10 9.000000E-01

0 11 1.000000E-00

### DISPLACEMENT

1 1 1.100000E+00

1 2 1.200000E+00

1 3 1.300000E+00

1 4 1.400000E+00

1 5 1.500000E+00

1 6 1.600000E+00

1 7 1.700000E+00

1 8 1.800000E+00

1 9 1.900000E+00

1 10 2.000000E+00

### DISPLACEMENT

3 1 3.100000E+00

3 2 3.200000E+00

3 3 3.300000E+00

3 4 3.400000E+00

3 5 3.500000E+00

3 6 3.600000E+00

3 7 3.700000E+00

3 8 3.800000E+00

3 9 3.900000E+00

3 10 4.000000E+00

### DISPLACEMENT

2 1 2.100000E+00

2 2 2.200000E+00

2 3 2.300000E+00

2 4 2.400000E+00

2 5 2.500000E+00

2 6 2.600000E+00

2 7 2.700000E+00

2 8 2.800000E+00

2 9 2.900000E+00

2 10 3.000000E+00

S2-ref 70

• Overview
• Distributed Local Data
• Program
• Results

S2-ref 71

Results for CG Solver

Time for 100 Iterations in N=10^6 case

based on a core
based on a node (16 cores)

Chart 1 (Speed-Up vs. Core #):
    values:  1.0  2.0  3.7  5.9  7.1  17.6  42.1  58.3  70.5  80.7  87.5
    axis labels:  1.0  10.0  100.0  /  1  10  100
    legend:  ideal,  N=10^4,  N=10^6

Chart 2 (Speed-Up vs. Core #):
    values:  16.0  39.9  95.0  131.7  159.2  182.2  197.6
    axis labels:  10.0  100.0  /  10  100
    legend:  ideal,  N=10^6

72

Performance is lower than ideal one

• Time for MPI communication
  – Time for sending data
  – Communication bandwidth between nodes
  – Time is proportional to size of sending/receiving buffers
• Time for starting MPI
  – latency
  – does not depend on size of buffers
    • depends on number of calling, increases according to process #
  – O(10^0)-O(10^1) µsec.
• Synchronization of MPI
  – Increases according to number of processes

S2-ref 73

Performance is lower than ideal one (cont.)

• If computation time is relatively small (N is small in S1-3), these effects are not negligible.
  – If the size of messages is small, effect of "latency" is significant.

S2-ref

Performance is not so good... between 1-16 cores

• x7.1 at 16 cores, because of memory contention
  – STREAM
  – NOT mainly because of communication overhead

S2-ref 74

based on performance at a single core

Chart (Speed-Up vs. Core #):
    values:  1.0  2.0  3.7  5.9  7.1  17.6  42.1  58.3  70.5  80.7  87.5
    axis labels:  1.0  10.0  100.0  /  1  10  100
    legend:  ideal,  N=10^4,  N=10^6

Figure: 16 x [ L1 / C ]  ->  L2  ->  Memory

75

Not so significant in S1-3

S1-3
• ◆: N=10^6, ●: 10^8, ▲: 10^9, －: Ideal
• Based on performance at a single core (sec.)
• Trapezoidal rule: requirement for memory is very small (no arrays), NON memory-bound application

S2-ref

Chart (Speed-Up vs. Core #):
    values:  1.0  2.0  4.0  7.9  15.8
    axis labels:  0.0  5.0  10.0  15.0  20.0  /  0  5  10  15  20
    legend:  ideal,  N=10^6,  N=10^8,  N=10^9

Super-Linear in Strong Scaling

• In strong scaling case where entire problem size is fixed, performance is generally lower than the ideal one due to communication overhead.

• But sometimes, actual performance may be better than the ideal one. This is called "super-linear"
  – only for scalar processors
  – does not happen in vector processors

Figure (Speed-Up vs. PE#): ideal, actual, super-linear

S2-ref 76

Typical Behaviors

Chart 1 (GFLOPS vs. DOF: Problem Size):
    GFLOPS axis:  0.00  0.50  1.00  1.50  2.00  2.50  3.00
    DOF axis:     1.0E+04  1.0E+05  1.0E+06  1.0E+07

Chart 2 (GFLOPS vs. DOF: Problem Size):
    GFLOPS axis:  1.0E-01  1.0E+00  1.0E+01  1.0E+02
    DOF axis:     1.0E+04  1.0E+05  1.0E+06  1.0E+07

Annotations:  8 % of peak,  40 % of peak

S2-ref 77

Earth Simulator: Higher performance for large-scale problems with longer loops

IBM-SP3: Higher performance for small problems, effect of cache

Strong Scaling

Earth Simulator: Performance is getting worse due to communication overhead and smaller loop length if PE number is larger.

IBM-SP3: "Super-linear" happens if number of PE is not so large. Performance is getting worse due to communication overhead if PE number is larger.

S2-ref 78

Charts (two panels): Performance vs. PE#, with "Ideal" line

Why does "Super-Linear" happen ?

• Effect of Cache
• In scalar processors, performance for smaller problem is generally better.
  – Cache is well-utilized.

Figure: CPU [ Register | Cache ] <-> Main Memory   (FAST ... SLOW)

S2-ref 79

S2-ref 80

Memory Copy is expensive (1/2)

Figure: SendBuf

    | neib#0      | neib#1      | neib#2      | neib#3      |
    | BUFlength_e | BUFlength_e | BUFlength_e | BUFlength_e |
    ^             ^             ^             ^             ^
    export_index[0]  export_index[1]  export_index[2]  export_index[3]  export_index[4]



kk= export_item[k];



tag= 0;




ierr= MPI_Isend

(&SendBuf[iS_e], BUFlength_e, MPI_DOUBLE, NeibPE[neib], 0,



export_item (export_index[neib]:export_index[neib+1]-1) are sent to neib-th neighbor

Copied to sending buffers

S2-ref 81

Memory Copy is expensive (2/2)

Figure: RecvBuf

    | neib#0      | neib#1      | neib#2      | neib#3      |
    | BUFlength_i | BUFlength_i | BUFlength_i | BUFlength_i |


tag= 0;




ierr= MPI_Irecv






kk= import_item[k];


}}

import_index[0]  import_index[1]  import_index[2]  import_index[3]  import_index[4]

import_item (import_index[neib]:import_index[neib+1]-1) are received from neib-th neighbor

Copied from receiving buffer

S2-ref 82

Summary: Parallel FEM

• Proper design of data structure of distributed local meshes.

• Open Technical Issues
  – Parallel Mesh Generation, Parallel Visualization
  – Parallel Preconditioner for Ill-Conditioned Problems
  – Large-Scale I/O

83

Distributed Local Data Structure for Parallel Computation

• Distributed local data structure for domain-to-domain communications has been introduced, which is appropriate for such applications with sparse coefficient matrices (e.g. FDM, FEM, FVM etc.).
  – SPMD
  – Local Numbering: Internal pts to External pts
  – Generalized communication table

• Everything is easy, if proper data structure is defined:
  – Values at boundary pts are copied into sending buffers
  – Send/Recv
  – Values at external pts are updated through receiving buffers

S2-ref 84

SEND/RECV (Original)

sta1=(MPI_Status*)allocate_vector(sizeof(MPI_Status),NEIBPETOT);

sta2=(MPI_Status*)allocate_vector(sizeof(MPI_Status),NEIBPETOT);

req1=(MPI_Request*)allocate_vector(sizeof(MPI_Request),NEIBPETOT);

req2=(MPI_Request*)allocate_vector(sizeof(MPI_Request),NEIBPETOT);

for( neib=1;neib<=NEIBPETOT;neib++){

istart=EXPORT_INDEX[neib-1];

inum

=EXPORT_INDEX[neib]-istart;

for( k=istart;k<istart+inum;k++){

ii= EXPORT_ITEM[k];

WS[k]= X[ii-1];}

MPI_Isend(&WS[istart],inum,MPI_DOUBLE,

NEIBPE[neib-1],0,MPI_COMM_WORLD,&req1[neib-1]);}


istart=IMPORT_INDEX[neib-1];

inum

=IMPORT_INDEX[neib]-istart;

MPI_Irecv(&WR[istart],inum,MPI_DOUBLE,


MPI_Waitall(NEIBPETOT, req2, sta2);



inum



ii = IMPORT_ITEM[k];

X[ii-1]= WR[k];}

} MPI_Waitall(NEIBPETOT, req1, sta1);

S2-ref 85

Combine MPI_Waitall's

sta1=(MPI_Status*)allocate_vector

(sizeof(MPI_Status), 2*NEIBPETOT);

req1=(MPI_Request*)allocate_vector(sizeof(MPI_Request),2*NEIBPETOT);



inum



ii= EXPORT_ITEM[k];

WS[k]= X[ii-1];}





inum


MPI_Irecv(&WR[istart],inum,MPI_DOUBLE,

NEIBPE[neib-1],0,MPI_COMM_WORLD,&req1[NEIBPETOT+neib-1]);}

MPI_Waitall(2*NEIBPETOT, req1, sta1);



inum



ii = IMPORT_ITEM[k];

X[ii-1]= WR[k];}

}

86

If numbering of external nodes is continuous in each neighboring process ...

12

34

67

85

910

1112

1415

1613

1718

1920

2223

2421

2526

2728

3031

3229

3334

3536

3839

4037

4142

4344

4647

4845

4950

5152

5455

5653

5758

5960

6263

6461

6566

6768

7071

7269

7576777879807473

8481

8582

8688

8783

8990919293949596

87

SEND/RECV (NEW:1)

sta1=(MPI_Status*)allocate_vector(sizeof(MPI_Status),2*NEIBPETOT);




inum



ii= EXPORT_ITEM[k];

WS[k]= X[ii-1];}





inum

=NOD_IMPORT[IMPORT_INDEX[neib-1]];

MPI_Irecv(&X[istart],inum,MPI_DOUBLE,



88

SEND/RECV (NEW:2), N0: int. node #

sta1=(MPI_Status*)allocate_vector(sizeof(MPI_Status),2*NEIBPETOT);




inum



ii= EXPORT_ITEM[k];

WS[k]= X[ii-1];}





inum

=IMPORT_INDEX[neib-1] + N0;

MPI_Irecv(&X[istart],inum,MPI_DOUBLE,



N0: # of Internal Nodes (内点総数)

report s2 - 東京大学nkl.cc.u-tokyo.ac.jp/14w/cw-mpi/s2-ref.pdf · report s2 kengo nakajima...

Documents