Программирование для intel xeon phi
Post on 04-Jan-2016
86 Views
Preview:
DESCRIPTION
TRANSCRIPT
4
4 Intel Xeon Phi .., .. Intel . .. 1, ? , . *
. .*
*
#. , 2013 .
Xeon Xeon Phi:
#. , 2013 .
1. #. , 2013 .
.: . . Intel Xeon Intel Xeon Phi. .
#. , 2013 .
2 Intel Xeon E5-2690 (2.9 GHz)2 Intel Xeon Phi 7110X (61 )64 GB Linux CentOS 6.2, Intel Parallel Studio XE 2013 SP1#. , 2013 .
2. #. , 2013 .
- (1) (2) - t - t - ( ) - ( ) - ( ) - (E=0 P=1, , Wt Ws ~ N(0, t-s), s < t), Wt () P=1. - .
#. , 2013 .
-
. , S ( ). , S ( ). , .
#. , 2013 .
-,
:
(3)
( ) (, -), Wt N(0, t).
#. , 2013 .
?
tS , Wt - . 1 , .
#. , 2013 .
P1 P2, P2 t P1 P1 K, . P2 () C P1. K (, strike price), C .
#. , 2013 .
- . P1 P2. C T ( , maturity, ) : K . ST K. ST < K, , C, C. ST > K, K, ( C ST K).
#. , 2013 .
, / . P2: (4)
T , (1. t = T 1. t = 0)#. , 2013 .
- . t=0 (F ):
(5)#. , 2013 .
3. #. , 2013 .
. , , , . , , . .
#. , 2013 .
#. , 2013 .
4. #. , 2013 .
(AoS Array of Structures)
(SOA Structure of Arrays): , . .#. , 2013 .
/* Number of worker threads; overridden by the optional third argument. */
int numThreads = 1;
/* Number of options to price; overridden by the first argument. */
int N = 60000000;

/*
 * Program entry point.
 *
 * Usage: size version [#of_threads]
 *   argv[1]  problem size, stored in N
 *   argv[2]  version selector
 *   argv[3]  optional thread count, stored in numThreads
 *
 * Returns 1 when the two mandatory arguments are missing, 0 otherwise.
 */
int main(int argc, char *argv[])
{
    int version;

    /* Both size and version are mandatory: argv[2] is read below, so at
       least three entries (program name + 2 arguments) must be present. */
    if (argc < 3) {
        printf("Usage: size version [#of_threads]\n");
        return 1;
    }

    N = atoi(argv[1]);
    version = atoi(argv[2]);
    if (argc > 3)
        numThreads = atoi(argv[3]);

    /* NOTE(review): the slide has a comment placeholder here whose text did
       not survive extraction; `version` is not used by the visible code. */

    float res = GetOptionPrice();
    printf("%.8f;\n", res);
    return 0;
}
/*
 * Model parameters shared by the pricing routines.
 * K is the strike price and T the maturity (both named on earlier slides);
 * sig, r and S0 are presumably the volatility, interest rate and initial
 * asset price -- TODO confirm, the defining slide text was lost.
 */
const float sig = 0.2f;
const float r = 0.05f;
const float T = 3.0f;
const float S0 = 100.0f;
const float K = 100.0f;

/*
 * Prices a single option from the file-level constants above.
 *
 * Uses cdfnormf(), which is not part of the standard C math library
 * (presumably supplied by the Intel math library -- TODO confirm).
 *
 * Returns the computed price C.
 */
float GetOptionPrice()
{
    float d1, d2;
    float cdf1, cdf2;

    d1 = (logf(S0 / K) + (r + sig * sig * 0.5f) * T) / (sig * sqrtf(T));
    d2 = (logf(S0 / K) + (r - sig * sig * 0.5f) * T) / (sig * sqrtf(T));

    cdf1 = cdfnormf(d1);
    cdf2 = cdfnormf(d2);

    /* Discounted strike weighted by cdf2, subtracted from S0 weighted by cdf1. */
    return S0 * cdf1 - K * expf((-1.0f) * r * T) * cdf2;
}
. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (log(pS0[i] / pK[i]) + (r + sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); d2 = (log(pS0[i] / pK[i]) + (r - sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * exp((-1.0) * r * pT[i]) * p2; }}#. , 2013 .
. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (log(pS0[i] / pK[i]) + (r + sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); d2 = (log(pS0[i] / pK[i]) + (r - sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * exp((-1.0) * r * pT[i]) * p2; }}#. , 2013 .
. N 60000000 120000000 180000000 240000000 V0 () 17,002 34,004 51,008 67,970 !
#. , 2013 .
1. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * expf((-1.0f) * r * pT[i]) * p2; }}#. , 2013 .
1. N 60000000 120000000 180000000 240000000 V0 () 17,002 34,004 51,008 67,970 V1 () 16,776 33,549 50,337 66,989 ? 3
#. , 2013 .
2. : cdfnorm() vs. erf() erff() cdfnormf(), .
: ?
#. , 2013 .
2. : cdfnorm() vs. erf()__declspec(noinline) void GetOptionPricesV2(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .
2. : cdfnorm() vs. erf() N 60000000 120000000 180000000 240000000 V0 17,002 34,004 51,008 67,970 V1 16,776 33,549 50,337 66,989 V2 2,871 5,727 8,649 11,230 !
#. , 2013 .
3. : restrict ?
restrict?
restrict?
restrict, . ?#. , 2013 .
3. : restrict ? -vec-report3 -vec-report6 (Linux) /Qvec-report3 /Qvec-report6 (Windows) -mavx ( SSE, AVX).
, ? , .
#. , 2013 .
3. : restrict__declspec(noinline) void GetOptionPricesV3( float * restrict pT, float * restrict pK, float * restrict pS0, float * restrict pC) { int i; float d1, d2, erf1, erf2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .
4. : simd__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .
4*. : ivdep vector always__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma ivdep#pragma vector always for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .
3-4. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,0671. : loop was vectorized (SIMD loop was vectorized)2. 3 , . 3. 8 , 5,43. .. 2 3!
#. , 2013 .
__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd // Intel . ivdep for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}5.43 #. , 2013 .
5. const float invsqrt2 = 0.707106781f;__declspec(noinline) void GetOptionPricesV5(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 * invsqrt2); erf2 = 0.5f + 0.5f * erff(d2 * invsqrt2); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }} . ? , #. , 2013 .
5. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085
#. , 2013 .
6. . __declspec(noinline) void GetOptionPricesV6(float *pT, float *pK, float *pS0, float *pC) { int i; float d1, d2, erf1, erf2, invf; float sig2 = sig * sig;#pragma simd for (i = 0; i < N; i++) { invf = invsqrtf(sig2 * pT[i]); d1 = (logf(pS0[i] / pK[i]) + (r + sig2 * 0.5f) * pT[i]) * invf; d2 = (logf(pS0[i] / pK[i]) + (r - sig2 * 0.5f) * pT[i]) * invf; erf1 = 0.5f + 0.5f * erff(d1 * invsqrt2); erf2 = 0.5f + 0.5f * erff(d2 * invsqrt2); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .
6. . N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 . .
#. , 2013 .
6.1. SSE: 16, AVX: 32, Xeon Phi: 64memalign() -> _mm_malloc()Windows: __declspec(align(XX)) float T[N];Linux: float T[N] __attribute__((aligned(64)));#pragma vector aligned, __assume_aligned, __assumeint main(int argc, char *argv[]){ pT = (float *)memalign(32, 4 * N * sizeof(float));// pT = new float[4 * N]; ... free(pT);// delete [] pT; return 0;} #. , 2013 .
6.2. .icc ... -fimf-precision=low -fimf-domain-exclusion=31 N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724#. , 2013 .
7. #pragma omp parallel for private(invf, d1, d2, erf1, erf2)N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724 V7(16 )0,0580,0840,1260,153#. , 2013 .
7.1. . .
: ?#. , 2013 .
7.1. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724 V70,0580,0840,1260,153 V6.30,4090,8121,2261,603 V7.10,0330,0620,0910,118#. , 2013 .
7.1. 7.1 6.3 , 7 6.2, 12.54 (60 . ) 13,61 (240 . ). 6.3 7,5% , 6.2, 7.1 , 7 ( 60 . , 70%, ).#. , 2013 .
Xeon Phi , 7. , , . , 6, Xeon. Xeon Phi -mmic. memalign() 32 64. . .
#. , 2013 .
Xeon Phi. Xeon Phi ( 6.2) 23%, , 2,3 . ( 60%) , , , . 6.3. . N60000000120000000180000000240000000 V61,5443,0894,6336,174 V6.11,5453,0914,6346,179 V6.20,6761,3522,0272,703 V6.30,4220,8451,2691,690#. , 2013 .
Xeon Phi. N60000000120000000180000000240000000 V70,1340,1490,1640,175S(V6.2/V7)5,03369,05012,33115,437 V7.10,0080,0170,0250,033S(V6.3/V7.1)50,58551,17851,78351,546N60000000120000000180000000240000000 V70,2340,2550,2570,255S(V6.2/V7)2,8855,3037,88310,590 V7.10,0070,0140,0210,028S(V6.3/V7.1)59,42259,58760,38959,839N60000000120000000180000000240000000 V70,5320,5270,5330,558S(V6.2/V7)1,2692,5643,8004,842 V7.10,0080,0160,0240,031S(V6.3/V7.1)53,28654,24853,96953,96460120240#. , 2013 .
Xeon Phi. 7 . , . ( 7.1) ( 50,5 60,4). 120 , 60 , Xeon Phi.
- ?#. , 2013 .
8. - 4 (pT, pK, pS0, pC). 3 , (pC) . , pC, . , ( pC nontemporal data). , , , nontemporal data, streaming stores, .#pragma vector nontemporal#. , 2013 .
8. -N60000000120000000180000000240000000 V80,0090,0180,0270,035S(V6.3/V8)46,87847,50547,61047,832N60000000120000000180000000240000000 V80,0070,0130,0190,026S(V6.3/V8)63,87365,04865,42665,887N60000000120000000180000000240000000 V80,0070,0130,0190,026S(V6.3/V8)63,22264,76865,37965,42060120240 54 65.
#. , 2013 .
Xeon vs. Xeon Phi N 60000000 120000000 180000000 240000000 Xeon 0,030 0,061 0,090 0,116 Xeon Phi 0,007 0,013 0,019 0,026 #. , 2013 .
, . , Call Put. , Xeon Xeon Phi, , Xeon Phi . Xeon Phi . , .#. , 2013 .
. , 2. . 3- . .: , 2007. . 832. .. . , 2004. 1076. .., .., .., .. . . . 4 . : - , 2013. 1394 .#. , 2013 .
, ..., , . . meerov@vmk.unn.ru ,..., sysoyev@vmk.unn.ru
#. , 2013 .
12.012.42.882.62.542.362.242.832.572.651.342.482.472.532.142.832.292.142.091.82.242.332.942.882.562.172.092.562.82.352.772.62.212.992.152.492.62.42.022.142.212.12.592.742.972.051.42.262.852.452.332.692.642.72.91.82.352.462.012.492.822.122.562.913.82.842.322.752.672.962.242.42.062.12.162.282.242.52.172.122.632.583.42.482.722.012.592.042.712.8821.82.532.662.882.962.012.142.522.213.82.282.472.752.052.372.392.12.082.092.282.522.992.632.592.952.362.732.952.722.862.832.752.332.462.022.1232.192.932.942.042.152.482.312.752.22.742.432.252.282.6132.662.012.882.152.262.562.352.472.692.152.352.852.112.622.822.762.442.042.562.432.52.012.072.922.552.72.342.322.022.22.172.892.372.382.722.572.45
1 2 3 4 5 6 7 8 9
1 1 2 3 4 5 6 7 8 902.012.42.882.62.542.362.242.832.57t12.651.342.482.472.532.142.832.292.14t22.091.82.242.332.942.882.562.172.09t32.562.82.352.772.62.212.992.152.49t42.62.42.022.142.212.12.592.742.97t52.051.42.262.852.452.332.692.642.7t62.91.82.352.462.012.492.822.122.56t72.913.82.842.322.752.672.962.242.4t82.062.12.162.282.242.52.172.122.63t92.583.42.482.722.012.592.042.712.88t1021.82.532.662.882.962.012.142.52t112.213.82.282.472.752.052.372.392.1t122.082.092.282.522.992.632.592.952.36t132.732.952.722.862.832.752.332.462.02t142.1232.192.932.942.042.152.482.31t152.752.22.742.432.252.282.6132.66t162.012.882.152.262.562.352.472.692.15t172.352.852.112.622.822.762.442.042.56t182.432.52.012.072.922.552.72.342.32T2.022.22.172.892.372.382.722.572.45
top related