vector-parallel modernization tim prince phd (me) intel black belt software developer sept. 24, 2015

Vector-Parallel Modernization

Tim Prince PhD (ME), Intel Black Belt Software Developer

Sept. 24, 2015

Introduction

• This presentation shows how to optimize some difficult loops with Intel compilers for Fortran, C, and C++. The examples are selected from the classic netlib.org vector benchmark: cases which current Intel compilers do not optimize automatically, but which exhibit good vector or parallel performance after modernization. This format shows a variety of applications and the relationship among the source languages.

Resolve suspected anti-dependence by omp4• equivalence(array(64),x(1))• ! this example has true anti-dependence: stores 64 elements beyond load• ! illustrates a pitfall of equivalence and equivalent C pointer overlaps• #if _OPENMP >= 201307• !$omp simd safelen(32)• ! this allows 2 cache lines of read ahead for 32-bit data type• #endif• do i= 1,n-1• x(i+1)= array(i)+a(i)• enddo

Anti-dependence example in C and C++• #define x ((real *)&cdata_1 + 63)• i2 = *n - 1;• #if _OPENMP >= 201307• #pragma omp simd safelen(32)• #endif• for (int i = 1; i <= i2; ++i)• cdata_1.array[i] = x[i - 1] + a[i];• // C++ :• // no pragma needed by g++• #pragma ivdep• transform(&x[0],&x[i2],&a[1],&cdata_1.array[1],plus<float>());

False suspected anti-dependence

• do i= 1,n-1• if(a(i) < 0.)then• ! order as in netlib public source• ! if(b(i) < 0.) a(i)= a(i)+c(i)*d(i)• ! b(i+1)= c(i)+d(i)*e(i)• ! switch order to remove vector dependence• b(i+1)= c(i)+d(i)*e(i)• if(b(i) < 0.) a(i)= a(i)+c(i)*d(i)• endif• enddo

C false anti-dependence avoided

• // no pragma needed if pointers are qualified by __restrict• for (int i = 1; i <= i2; ++i)

if (a[i] < 0.f) { b[i + 1] = c[i] + d[i] * e[i]; if (b[i] < 0.f)

a[i] += c[i] * d[i]; }

Optimize circular loop-carried dependency (avoid f90)• ! a(:n)= (b(:n)+cshift(b(:n),1)+cshift(b(:n),2))*.333• x= b(n)• y= b(n-1)• !$omp simd• do i= 1,n• a(i)= (b(i)+x+y)*.333• y= x• x= b(i)• enddo

C pragma circular dependency optimization• x = b[*n];• y = b[*n - 1];• i2 = *n;• #pragma omp simd• for (int i = 1; i <= i2; ++i) {• a[i] = (b[i] + x + y) * .333f;• y = x;• x = b[i];• }

Partial read after write dependency

• ! do i= 1,n ! Hidden partial dependency• ! x= a(n-i+1)+b(i)*c(i)• ! a(i)= x-1.0• ! b(i)= x• ! enddo• ! resolve by separating the dependencies• b(1:(n+1)/2)= a(n:n/2+1:-1)+b(1:(n+1)/2)*c(1:(n+1)/2)• ! ifort fuses here at -O3• a(1:(n+1)/2)= b(1:(n+1)/2)-1.0• b((n+3)/2:n)= a(n/2:1:-1)+b((n+3)/2:n)*c((n+3)/2:n)• a((n+3)/2:n)= b((n+3)/2:n)-1.0

Resolve false assumed WAR by C omp4 (explicit fusion creates false WAR dependence)

• #pragma omp simd• for (int i= 1; i <= (i2+1)/2; ++i)• a[i] = (b[i] = a[i2 - i + 1] + b[i] * c[i])- 1.f;• #pragma omp simd• for (int i= (i2+3)/2; i <= i2; ++i)• a[i] = (b[i] = a[i2 - i + 1] + b[i] * c[i])- 1.f;

Vectorize by splitting search and compute• ! i= 1• ! do while (a(i) >= 0.) ! Not vectorized• ! a(i)= a(i)+b(i)*c(i)• ! i= i+1• ! enddo• ! no more old-fashioned explicit masking• do i= 1,n• if(a(i) < 0) exit• enddo• a(:i-1)= a(:i-1)+b(:i-1)*c(:i-1)

C vectorized linear search and compute• i2 = *n;• // first i has scope outside for• for (i = 1; i <= i2; ++i)• if (a[i] < 0.f) break;• i2 = i - 1;• // this one needs * __restrict a or pragma• for (int i = 1; i <= i2; ++i)• a[i] += b[i] * c[i];

Overcome “protects against exception” by taking arithmetic outside, and directive• ! do i= 1,n• ! if(d(i) < 0)then• ! a(i)= a(i)+b(i)*c(i)• ! else• ! if(d(i).ne.0)then• ! a(i)= a(i)+c(i)*c(i)• ! else• ! a(i)= a(i)+b(i)*b(i)• ! endif• ! endif• ! enddo• !dir$ vector aligned• a(:n)= a(:n)+merge(b(:n),c(:n),d(:n)<=0)*merge(c(:n),b(:n),d(:n)/=0)

C avoidance of “protects against exception”• #pragma vector aligned• // using __restrict (or another pragma)• for (int i = 1; i <= i2; ++i)• a[i] +=(d[i] <= 0.f?b[i]:c[i]) * (d[i]==0.f?b[i]:c[i]);

Linear search not optimized: ifort doesn’t resolve 2-level reduction• max= aa(1,1)• xindex= 1• yindex= 1• do j= 1,n• do i= 1,n• if(aa(i,j) > max)then• max= aa(i,j)• xindex= i• yindex= j• endif• enddo• enddo

Parallel-vector linear search

• max_= aa(1,1)• xindex=1• yindex=1• !$omp parallel do private(ml) if(n>103) reduction(max: max_) &• !$omp& lastprivate(xindex,yindex)• do j=1,n• ml= maxloc(aa(:n,j),dim=1)• if(aa(ml,j)>max_ .or. aa(ml,j)==max_ .and. j<yindex)then• xindex= ml• yindex= j• max_=aa(ml,j)• endif• enddo

C parallel-vector linear search

• max__ = aa[aa_dim1 + 1];• xindex = yindex = 1;• i2 = i3 = *n;• #pragma omp parallel for if(i2 > 103) reduction(max: max__) lastprivate(xindex,yindex)• for (int j = 1; j <= i2; ++j) {• int indxj=0;• float maxj=max__;• #pragma omp simd reduction(max: maxj) lastprivate(indxj)• for (int i = 1; i <= i3; ++i) if (aa[i + j * aa_dim1] > maxj){• maxj = aa[i + j * aa_dim1];• indxj = i; }• if(maxj > max__) { // fixme: take care of the case of ties• max__= maxj;• xindex=indxj;• yindex=j;}}

Parallel vector convolution

• #if defined __INTEL_COMPILER• !$omp parallel do if(n>103)• do i= 1,m • a(i)= a(i)+dot_product(b(i:i+m-1),c(m:1:-1))• #else• ! single thread version (slightly less accurate)• ! ifort auto-parallel does an array reduction; it’s OK for a small number of cores• do j= 1,m• a(:m)= a(:m)+b(1+m-j:m+m-j)*c(j)• #endif• enddo

C parallel vector convolution

• #pragma omp parallel for if(i3 > 103)• for (int i = 1; i <= i3; ++i) {• float sum = 0;• #pragma omp simd reduction(+: sum)• for (int j = 1; j <= i2; ++j) • sum += b[i + j - 1] * c[i2 - j + 1];• a[i] += sum;• }

C++ parallel vector convolution

• // and here's a C++ version, which doesn't need AVX2 to optimize• // reverse the vector which is used repeatedly• vector<float> Cr(m);• reverse_copy(&c[1],&c[i3]+1,Cr.begin());• // It won't optimize with /Qprotect-parens (investigation requested)• #pragma omp parallel for if(i3 > 103)• for (int i = 1; i <= i3; ++i) • a[i] += inner_product(Cr.begin(),Cr.end(),&b[i],0.f);

False indexing dependency, no optimization• k = 1• do 10 i = 1,n• do 20 j = 2,n• bb(i,j) = bb(i,j-1) + array(k) * cc(i,j)• k = k + 1• 20 continue• k = k + 1• 10 continue

Optimize by making inner loops independent• !$omp parallel do private(k) if(n>103)• do i= 1,n• k= i*n+1-n• do j= 2,n• bb(i,j)= bb(i,j-1)+array(k)*cc(i,j)• k= k+1• enddo• enddo• ! version for single core• do j= 2,n• bb(:n,j)= bb(:n,j-1)+array(j-1:n*n+j-1:n)*cc(:n,j)• enddo

Loop nesting not corrected due to indexing• ! do 10 i = 1,n• ! k = i*(i-1)/2+i• ! do 20 j = i,n• ! array(k) = array(k) + bb(i,j)• ! k = k + j• ! 20 continue• ! 10 continue• ! swap loops for inner loop data locality

• do j= 1,n• k= j*(j-1)/2• array(k+1:k+j)= array(k+1:k+j)+bb(:j,j)• enddo

! That's good enough for single CPU auto-parallel auto-vectorizer

another auto-renesting failure

• ! do 30 i = 2,n• ! do 20 j = 2,n• ! aa(i,j) = aa(i,j-1) + cc(i,j)• ! 20 continue• ! do 30 j = 2,n• ! bb(i,j) = bb(i-1,j) + cc(i,j)• ! 30 continue• do j= 2,n• do i= 2,n• aa(i,j)= aa(i,j-1)+cc(i,j)• bb(i,j)= bb(i-1,j)+cc(i,j)• enddo• enddo

Explicit parallel C code

• #pragma omp parallel if(i2 > 53) {• #pragma omp for nowait• // setting up to proceed to the next loop when some cores finish here• #pragma novector• for (int j = 2; j <= i3; ++j)• for (int i = 2; i <= i2; ++i)• bb[i + j * bb_dim1] = bb[i - 1 + j*bb_dim1] + cc[i + j * cc_dim1];• #pragma omp for simd• for (int i = 2; i <= i2; ++i)• for (int j = 2; j <= i3; ++j)• aa[i + j * aa_dim1] = aa[i + (j - 1) * aa_dim1] + cc[i + j * cc_dim1];• }

Fallacy: compilers always optimize out of loop• ! do 10 i = 1,n-1• ! a(i) = b(i) + c(i) * d(i)• ! b(i) = c(i) + b(i)• ! a(i+1) = b(i) + a(i+1) * d(i)• ! 10 continue• do i= 1,n-1• a(i)= b(i)+c(i)*d(i)• b(i)= c(i)+b(i)• enddo• a(n)= b(n-1)+a(n)*d(n-1)

Repeated update of 1D array in 2D loop• ! do 10 i = 1,n• ! do 20 j = 2,n• ! a(j) = aa(i,j) - a(j-1)• ! aa(i,j) = a(j) + bb(i,j)• ! 20 continue• ! 10 continue• ! so store only the final values of a(:):• do j= 2,n• aa(:n,j)= aa(:n,j)+bb(:n,j)-a(j-1)• a(j)=aa(n,j)-bb(n,j)• enddo

Non-vectorizable 1D array removal

• ! loop swap should have been obvious• do 10 i = 2,n• do 20 j = 1,n• a(i) = aa(i,j) - a(i-1)• aa(i,j) = a(i) + bb(i,j)• 20 continue• 10 continue

it’s parallelizable, relatively tedious• !$omp parallel if(n>103)• !$omp do private(tmp)• do j= 1,n-1• tmp= a(1)• do i= 2,n• tmp= aa(i,j)-tmp• aa(i,j)= tmp+bb(i,j)• enddo• enddo• !$omp end do nowait• !$omp single• do i= 2,n• a(i)= aa(i,n)-a(i-1)• aa(i,n)= a(i)+bb(i,n)• enddo• !$omp end single• !$omp end parallel

Showing the C with the final loop first• #pragma omp parallel if(i2 > 103){• #pragma omp single• for (int i = 2; i <= i2; ++i) {• a[i] = aa[i + i3 * aa_dim1] - a[i - 1];• aa[i + i3 * aa_dim1] = a[i] + bb[i + i3 * bb_dim1];• }• #pragma omp for nowait• for (int j = 1; j < i3; ++j){• float tmp= a[1];• for (int i = 2; i <= i2; ++i) {• tmp = aa[i + j * aa_dim1] - tmp;• aa[i + j * aa_dim1] = tmp + bb[i + j * bb_dim1];• }}}

Save embarrassing one for last

• do 10 i = 1,n• a(i) = a(1)• 10 continue• ! don't over-write the rhs in a potentially recursive manner • ! (even by same value) unless by array assignment;• a(:n)= a(1)• In case you wondered, it’s the same between C and CEAN• a[1:i2] = a[1]; // is OK even though over-writing a[1]

Conclusions

• Auto-vectorization often needs help• Parallelization often needs more explicit “modernization”• Use tools to identify where to “modernize”:• VTune, Advisor

vector-parallel modernization tim prince phd (me) intel black belt software developer sept. 24, 2015

Documents

good vector

vector benchmark

pragma omp simd safelen32

true antidependence

parallel performance

equivalent c pointer

pragma novector

current intel compilers