Code vectorization for mobile devices

by Dmitriy Vovk


Post on 16-Jun-2015


DESCRIPTION

Briefly on code vectorization for mobile devices.

TRANSCRIPT

Page 1: Code vectorization for mobile devices

by Dmitriy Vovk

CODE VECTORIZATION for mobile devices

Page 2: Code vectorization for mobile devices

Hardware

• Typical hardware found in modern mobile devices:
  – ARMv7 instruction set
  – Cortex A8 / Cortex A9 / custom cores (Krait, Swift)
  – 800–1500 MHz
  – 1–4 cores
  – Thumb-2 instruction set
  – VFPv3
  – NEON, optional for Cortex A9. Nvidia Tegra 2 has no NEON support

Page 3: Code vectorization for mobile devices

NEON

• NEON is a general-purpose SIMD engine designed by ARM for the ARM processor architecture
• 16 registers, each 128 bits wide. Supports operations on 8-, 16-, 32- and 64-bit integers and 32-bit float values

Page 4: Code vectorization for mobile devices

NEON

• NEON can be used for:
  – Software geometry instancing;
  – Skinning on ES 1.1;
  – As a general vertex processor;
  – Other typical applications for SIMD.

Page 5: Code vectorization for mobile devices

NEON

• Some unified shader architectures, like the popular Imagination Technologies USSE1 (PowerVR SGX 530–545), are scalar, while NEON is vector by nature. Move your vertex processing from the GPU to the CPU to speed up calculations*
• ???????
• PROFIT!!!111
• *NOTE: that doesn't apply to USSE2 hardware

Page 6: Code vectorization for mobile devices

NEON

• The weakest side of mobile GPUs is fill rate, and fill rate is quickly killed by blending. 2D games are heavy on this. The PowerVR USSE engine doesn't care what it processes – vertices or fragments. Moving your vertex processing to the CPU (NEON) will leave some room for fragment processing.

Page 7: Code vectorization for mobile devices

NEON

• There are 3 ways to use NEON vectorization in your code:
  1. Intrinsics
  2. Handwritten NEON assembly
  3. Autovectorization by the compiler:
     –mllvm –vectorize –mllvm –bb-vectorize-aligned-only compiler flags for LLVM;
     -ftree-vectorize -ftree-vectorizer-verbose=4 -mfpu=neon -funsafe-math-optimizations for GCC

Page 8: Code vectorization for mobile devices

DEMO

Page 9: Code vectorization for mobile devices

Measurements

• Intrinsics:

Page 10: Code vectorization for mobile devices

Measurements

• Assembly:

Page 11: Code vectorization for mobile devices

Measurements

• Summary:
• Intrinsics got me a 25% speedup over assembly.
• Note that the speed of intrinsics code varies from compiler to compiler.

                     Running time, ms   CPU usage, %
Intrinsics                 2764              19
Assembly                   3664              20
FPU                        6209             25-28
FPU autovectorized         5028             22-24

Page 12: Code vectorization for mobile devices

NEON

• Intrinsics advantages over assembly:
  – Higher-level code;
  – No need to manage registers;
  – You can vectorize basic blocks and build a solution to every new problem with these blocks. In contrast, with assembly you have to solve each new problem from scratch;

Page 13: Code vectorization for mobile devices

NEON

• Assembly advantages over intrinsics:
  – Code generated from intrinsics varies from compiler to compiler and can give you a really big difference in speed. Assembly code will always be the same.

Page 14: Code vectorization for mobile devices

Code

void Update() {
    GLKMatrix4 modelviewMat = {
        1, 0, 0, 0,
        0, 1, 0, 0,
        0, 0, 1, 0,
        0, 0, 0, 1
    };

    const float Y_DELTA = 420.0f / QUADS_COUNT;

    for (int i = 0; i < QUADS_COUNT * VERTS_PER_QUAD; i += VERTS_PER_QUAD) {
        modelviewMat.m[12] = random() % 260;
        modelviewMat.m[13] = Y_DELTA;
#ifdef ASM
        CalculateSpriteVertsWorldPos((float32x4x4_t*)proj.m,
                                     (float32x4x4_t*)modelviewMat.m,
                                     (float32x4_t*)&data[i + 0].pos,
                                     (float32x4_t*)&data[i + 1].pos,
                                     (float32x4_t*)&data[i + 2].pos,
                                     (float32x4_t*)&data[i + 3].pos);
#else
        float32x4x4_t modelviewProj;
        Matrix4ByMatrix4((float32x4x4_t*)proj.m,
                         (float32x4x4_t*)modelviewMat.m,
                         &modelviewProj);
        for (int j = 0; j < 4; ++j) {
            Matrix4ByVec4(&modelviewProj,
                          (float32x4_t*)&squareVertices[j],
                          (float32x4_t*)&data[i + j].pos);
        }
#endif
    }
    glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
    glBufferData(GL_ARRAY_BUFFER, sizeof(data), data, GL_STREAM_DRAW);
}

Page 15: Code vectorization for mobile devices

Code

__attribute__((always_inline)) void Matrix4ByVec4(const float32x4x4_t* __restrict__ mat,
                                                  const float32x4_t* __restrict__ vec,
                                                  float32x4_t* __restrict__ result)
{
    (*result) = vmulq_n_f32((*mat).val[0], (*vec)[0]);
    (*result) = vmlaq_n_f32((*result), (*mat).val[1], (*vec)[1]);
    (*result) = vmlaq_n_f32((*result), (*mat).val[2], (*vec)[2]);
    (*result) = vmlaq_n_f32((*result), (*mat).val[3], (*vec)[3]);
}

Page 16: Code vectorization for mobile devices

Code

__attribute__((always_inline)) void Matrix4ByMatrix4(const float32x4x4_t* __restrict__ m1,
                                                     const float32x4x4_t* __restrict__ m2,
                                                     float32x4x4_t* __restrict__ r)
{
#ifdef INTRINSICS
    (*r).val[0] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[0], 0));
    (*r).val[1] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[1], 0));
    (*r).val[2] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[2], 0));
    (*r).val[3] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[3], 0));
    (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[1], vgetq_lane_f32((*m2).val[0], 1));
    (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[1], vgetq_lane_f32((*m2).val[1], 1));
    (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[1], vgetq_lane_f32((*m2).val[2], 1));
    (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[1], vgetq_lane_f32((*m2).val[3], 1));
    (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[2], vgetq_lane_f32((*m2).val[0], 2));
    (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[2], vgetq_lane_f32((*m2).val[1], 2));
    (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[2], vgetq_lane_f32((*m2).val[2], 2));
    (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[2], vgetq_lane_f32((*m2).val[3], 2));
    (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[3], vgetq_lane_f32((*m2).val[0], 3));
    (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[3], vgetq_lane_f32((*m2).val[1], 3));
    (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[3], vgetq_lane_f32((*m2).val[2], 3));
    (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[3], vgetq_lane_f32((*m2).val[3], 3));
#endif
}

Page 17: Code vectorization for mobile devices

Code

__asm__ volatile (
    "vldmia %6, { q0-q3 }     \n\t"
    "vldmia %0, { q8-q11 }    \n\t"
    "vmul.f32 q12, q8, d0[0]  \n\t"
    "vmul.f32 q13, q8, d2[0]  \n\t"
    "vmul.f32 q14, q8, d4[0]  \n\t"
    "vmul.f32 q15, q8, d6[0]  \n\t"
    "vmla.f32 q12, q9, d0[1]  \n\t"
    "vmla.f32 q13, q9, d2[1]  \n\t"
    "vmla.f32 q14, q9, d4[1]  \n\t"
    "vmla.f32 q15, q9, d6[1]  \n\t"
    "vmla.f32 q12, q10, d1[0] \n\t"
    "vmla.f32 q13, q10, d3[0] \n\t"
    "vmla.f32 q14, q10, d5[0] \n\t"
    "vmla.f32 q15, q10, d7[0] \n\t"
    "vmla.f32 q12, q11, d1[1] \n\t"
    "vmla.f32 q13, q11, d3[1] \n\t"
    "vmla.f32 q14, q11, d5[1] \n\t"
    "vmla.f32 q15, q11, d7[1] \n\t"
    "vldmia %1, { q0-q3 }     \n\t"
    "vmul.f32 q8, q12, d0[0]  \n\t"
    "vmul.f32 q9, q12, d2[0]  \n\t"
    "vmul.f32 q10, q12, d4[0] \n\t"
    "vmul.f32 q11, q12, d6[0] \n\t"
    "vmla.f32 q8, q13, d0[1]  \n\t"
    "vmla.f32 q8, q14, d1[0]  \n\t"
    "vmla.f32 q8, q15, d1[1]  \n\t"
    "vmla.f32 q9, q13, d2[1]  \n\t"
    "vmla.f32 q9, q14, d3[0]  \n\t"
    "vmla.f32 q9, q15, d3[1]  \n\t"
    "vmla.f32 q10, q13, d4[1] \n\t"
    "vmla.f32 q10, q14, d5[0] \n\t"
    "vmla.f32 q10, q15, d5[1] \n\t"
    "vmla.f32 q11, q13, d6[1] \n\t"
    "vmla.f32 q11, q14, d7[0] \n\t"
    "vmla.f32 q11, q15, d7[1] \n\t"
    "vstmia %2, { q8 }        \n\t"
    "vstmia %3, { q9 }        \n\t"
    "vstmia %4, { q10 }       \n\t"
    "vstmia %5, { q11 }"
    :
    : "r" (proj), "r" (squareVertices), "r" (v1), "r" (v2), "r" (v3),
      "r" (v4), "r" (modelView)
    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15"
);

Page 18: Code vectorization for mobile devices

Docs

• For a detailed explanation of intrinsics and assembly see:
  http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491e/CIHJBEFE.html

Page 19: Code vectorization for mobile devices

Contact me

http://www.linkedin.com/in/dvovk/
http://nukecode.blogspot.com/