Евгений Крутько, Многопоточные вычисления,...
TRANSCRIPT
C++ User Group Russia, Ekaterinburg, 25.11.2016
Многопоточные вычисления, современный подход
Крутько Е.С., НИЦ «Курчатовский институт»
Цель доклада
Современный стандарт C++ позволяет делать многопоточные вычисления легко и удобно.
Если приложение или библиотека тратит меньше времени на работу – это хорошо
Исходники тестов: https://github.com/eskrut/multithread.git
2
Немного истории. Нативные потоки
3
#include <stdlib.h>#include <pthread.h>#include <assert.h>
// Thread entry point with the signature pthread_create expects.
// Stores 1 into the int that `arg` points to.
//   arg     - address of the int to write; must not be null.
//   returns - always nullptr; the result is delivered through `arg`.
void* someParallelTask(void *arg) {
    int *preciousValue = static_cast<int*>(arg);
    *preciousValue = 1;
    // A function declared to return void* has to return a value:
    // flowing off the end is undefined behavior.
    return nullptr;
}
// Runs someParallelTask once on a separate POSIX thread and once on the
// calling thread, then checks that both calls produced the same value.
// Returns 0 on success, 1 when the thread could not be started or joined.
int main(int argc, char**argv) {
    pthread_t thread;
    int value = 0;
    int otherValue = 0;

    // pthread_create reports failure through its return value; without the
    // check a failed start would go unnoticed and the join below would
    // operate on an invalid thread id.
    if (pthread_create(&thread, nullptr, someParallelTask, &value) != 0)
        return 1;

    someParallelTask(&otherValue);

    if (pthread_join(thread, nullptr) != 0)
        return 1;

    assert(value == otherValue);
    return 0;
}
#include <stdlib.h>#include <windows.h>#include <assert.h>
// Thread entry point with the signature CreateThread expects.
// Stores 1 into the int that `arg` points to.
//   arg     - address of the int to write; must not be null.
//   returns - always 0, used as the thread's exit code.
DWORD WINAPI someParallelTask(void *arg) {
    int *preciousValue = static_cast<int*>(arg);
    *preciousValue = 1;
    // A function declared to return DWORD has to return a value:
    // flowing off the end is undefined behavior.
    return 0;
}
int main(int argc, char**argv) {
HANDLE thread; int value, othreValue;
thread = CreateThread( NULL, 0, someParallelTask, &value, 0, NULL ); someParallelTask(&othreValue); WaitForSingleObject(thread, INFINITE); assert(value == othreValue);
return 0;}
Стандартные потоки. C++11
4
#include <stdlib.h>#include <thread>#include <assert.h>
// Worker used by the std::thread example: writes 1 into the referenced int.
//   value - output parameter that receives the result.
void someParallelTask(int &value)
{
    value = 1;
}
// Runs someParallelTask once on a std::thread and once on the calling
// thread, then checks that both calls produced the same value.
int main(int argc, char**argv)
{
    int value;
    int otherValue;

    // std::ref is required so the thread writes into `value` itself
    // rather than into a copy of it.
    std::thread worker(someParallelTask, std::ref(value));
    someParallelTask(otherValue);
    worker.join();

    assert(value == otherValue);
    return 0;
}
Threadpool. C++11
5
#include <stdlib.h>#include <assert.h>
//git submodule add https://github.com/progschj/ThreadPool.git#include "ThreadPool/ThreadPool.h"
// Sequential and thread-pool variants of the same computation.
int someExample();
int someExampleParallel();

// Runs the sequential variant first, then the parallel one.
int main(int argc, char**argv)
{
    someExample();
    someExampleParallel();
    return 0;
}
Threadpool. C++11
6
// Sequential reference implementation: evaluates two values one after the
// other and combines them.
//   returns - the sum of both values (6).
int someExample() {
    int value1;
    // code to evaluate value1
    // may require significant amount of time
    value1 = 3;

    int value2;
    // code to evaluate value2
    // may require significant amount of time
    value2 = 3;

    // Now use some fancy algorithm using values
    int result = value1 + value2;
    assert(result == 6);

    return result;
}
Threadpool. C++11
7
// Thread-pool variant of someExample: both values are evaluated as tasks
// submitted to a ThreadPool, and the results are collected through the
// objects returned by enqueue().
//   returns - the sum of both values (6).
int someExampleParallel() {
    ThreadPool pool(8);

    auto futureValue1 = pool.enqueue([](){
        int value1;
        // code to evaluate value1
        // may require significant amount of time
        value1 = 3;
        return value1;
    });

    auto futureValue2 = pool.enqueue([](){
        int value2;
        // code to evaluate value2
        // may require significant amount of time
        value2 = 3;
        return value2;
    });

    // Now use some fancy algorithm using values
    int result = futureValue1.get() + futureValue2.get();
    assert(result == 6);

    return result;
}
std::async. C++11
8
// std::async variant of someExample: each value is evaluated in its own
// asynchronous task and collected through a std::future.
//   returns - the sum of both values (6).
int someExampleParallel() {
    // std::launch::async is requested explicitly: with the default policy
    // the implementation may defer a task until get() is called, which
    // would run both evaluations one after the other.
    auto futureValue1 = std::async(std::launch::async, [](){
        int value1;
        // code to evaluate value1
        // may require significant amount of time
        value1 = 3;
        return value1;
    });

    auto futureValue2 = std::async(std::launch::async, [](){
        int value2;
        // code to evaluate value2
        // may require significant amount of time
        value2 = 3;
        return value2;
    });

    // Now use some fancy algorithm using values
    int result = futureValue1.get() + futureValue2.get();
    assert(result == 6);

    return result;
}
Пример из жизни
9
// Reads every photo item of the model from `path`, then emits loaded()
// and calls itemChanged() for each row.
//   path - passed unchanged to readDown() for every photo item.
void PhotoSortModel::fill(const QString &path)
{
    unsigned numRows = invisibleRootItem()->rowCount();

    // Processes rows [start, stop). After each row, "partialDone" is invoked
    // directly with the worker id and the row's offset inside the range.
    auto read = [this, path](int id, int start, int stop) {
        int row = start;
        while (row < stop) {
            auto photo = photoItem(row);
            readDown(photo, path);
            QMetaObject::invokeMethod(this, "partialDone", Qt::DirectConnection,
                                      Q_ARG(int, id), Q_ARG(int, row - start));
            ++row;
        }
        return 0;
    };

    // Single sequential pass over all rows.
    read(0, 0, numRows);

    emit(loaded());

    for (unsigned row = 0; row < numRows; ++row) {
        itemChanged(photoItem(row));
    }
}
Пример из жизни
10
// read(0, 0, numRows);
doneMap_.clear(); doneMap_[-1] = numRows; std::list<std::future<int>> futures; unsigned numThreads = std::thread::hardware_concurrency(); for(unsigned ct = 0; ct < numThreads; ++ct) { doneMap_[ct] = 0; unsigned start = (ct*numRows)/numThreads; unsigned stop = ((ct+1)*numRows)/numThreads; if( (ct + 1) == numThreads && stop > numRows ) stop = numRows; futures.push_back(std::async(read, ct, start, stop)); } for(auto &f : futures) f.get();
Последовательный код Параллельный
365.236 64.5399
Неприятности с параллельностью
11
// Counter that the tasks below modify through a reference without any lock or atomic.
int dataRaceTarget = 0;
// Number of loop iterations each task performs.
int numCycles = 1000;
// taskP increments and taskM decrements the referenced counter numCycles times; both return 0.
// NOTE: `volatile` does not synchronize threads - this is the data-race demonstration the
// helgrind / Thread Sanitizer reports later in this document refer to.
auto taskP = [](int volatile &dataRaceTarget, int numCycles){ for(size_t ct = 0; ct < numCycles; ++ct) dataRaceTarget++; return 0; }; auto taskM = [](int volatile &dataRaceTarget, int numCycles){ for(size_t ct = 0; ct < numCycles; ++ct) dataRaceTarget--; return 0; };
Неприятности с параллельностью
12
std::list<std::thread> threads; for(size_t ct = 0; ct < std::thread::hardware_concurrency(); ++ct) { if(ct % 2) threads.push_back(std::thread(std::bind( taskP, std::ref(dataRaceTarget), numCycles))); else threads.push_back(std::thread(std::bind( taskM, std::ref(dataRaceTarget), numCycles))); } for(auto &t : threads) { t.join(); } std::cout << "result: " << dataRaceTarget << std::endl;
Поиск ошибок
Инструменты поиска несинхронного доступа к памяти: • valgrind • clang/gcc (linux only [пока])
13
valgrind
valgrind --num-callers=1 --tool=helgrind ./datarace
==77960== Possible data race during read of size 4 at 0x10480810C by thread #3
==77960== Locks held: none
==77960==    at 0x100002D2C: main::$_0::operator()(int volatile&, int) const (datarace.cpp:15)
==77960==
==77960== This conflicts with a previous write of size 4 by thread #2
==77960== Locks held: none
==77960==    at 0x100003634: main::$_1::operator()(int volatile&, int) const (datarace.cpp:21)
==77960==  Address 0x10480810c is on thread #1's stack
14
// NOTE(review): presumably the statement the helgrind report above points at - the unsynchronized increment of the shared counter.
dataRaceTarget++;
Thread Sanitizer (clang/gcc)
# Opt-in switch (OFF by default) that builds and links the project with
# -fsanitize=thread.
option(TUNE_THREAD_SANITIZER "Perform thread error sanitizing" OFF)

if(TUNE_THREAD_SANITIZER)
    # Each CMake command has to sit on its own line.
    # The sanitizer flag is added to both the compile and the executable link flags.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread -fPIC -fPIE")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread -pie")
endif()
Thread T2 (tid=8175, running) created by main thread at:
  #0 pthread_create <null>:0 (libtsan.so.0+0x000000047f23)
  #1 std::thread::_M_start_thread(std::shared_ptr<std::thread::_Impl_base>) <null>:0 (libstdc++.so.6+0x0000000b6a90)
  #2 main /media/psf/Home/Google Drive/work/doc/meetingCpp/2016Ekb/code/parallelTests/datarace/datarace.cpp:30 (datarace+0x000000005d8b)
15
// NOTE(review): presumably the statement the Thread Sanitizer report above points at - the place where a racing thread is created.
threads.push_back(std::thread(std::bind( taskP, std::ref(dataRaceTarget), numCycles)));
Параллельная [стандартная] библиотека / на примере gcc
16
const size_t length = 0.1 /*Gb*/ * 1024ull /*Mb*/ * 1024 /*Kb*/ * 1024 /*b*/ / sizeof(size_t) /*count*/; std::vector<size_t> vecOrigin; vecOrigin.reserve(length); for(size_t ct = 0; ct < length; ++ct) vecOrigin.push_back(ct);
auto vecToSort = vecOrigin; std::shuffle(vecToSort.begin(), vecToSort.end(), std::default_random_engine( hr_clock::now().time_since_epoch().count() ) );
auto vecToSort2 = vecToSort;
Параллельная [стандартная] библиотека / на примере gcc
17
size_t max; sw.start(); max = *std::max_element( vecToSort.begin(), vecToSort.end()); std::cout << sw.stop() << std::endl;
Параллельная [стандартная] библиотека / на примере gcc
18
size_t max; sw.start(); max = *std::max_element( vecToSort.begin(), vecToSort.end()); std::cout << sw.stop() << std::endl;
size_t max2; sw.start(); max2 = *std::__parallel::max_element( vecToSort2.begin(), vecToSort2.end()); std::cout << sw.stop() << std::endl;
if(max != length-1) throw std::runtime_error("Cant evaluete max with sequential max_element"); if(max2 != length-1) throw std::runtime_error("Cant evaluete max with parallel max_element");
Параллельная [стандартная] библиотека / на примере gcc
19
// Time the sequential sort.
sw.start();
std::sort(vecToSort.begin(), vecToSort.end());
std::cout << sw.stop() << std::endl;

// Time the std::__parallel sort on the second, identically shuffled copy.
sw.start();
std::__parallel::sort(vecToSort2.begin(), vecToSort2.end());
std::cout << sw.stop() << std::endl;
Параллельная [стандартная] библиотека / на примере gcc
20
const size_t lengthData = 25000; const size_t lengthVector = 6 /*Gb*/ * 1024ull /*Mb*/ * 1024 /*Kb*/ * 1024 /*b*/ / sizeof(size_t) /*count*/ / lengthData; std::vector<std::vector<size_t>> dataBundle; dataBundle.resize(lengthData, std::vector<size_t>(lengthVector, 0)); auto gen = std::default_random_engine( hr_clock::now().time_since_epoch().count() ); for(auto &vec : dataBundle) { for(auto &value : vec) { value = gen(); } }
Параллельная [стандартная] библиотека / на примере gcc
21
// Sequential run: store the maximum of every inner vector in maxes.
std::vector<size_t> maxes(lengthData, 0);
sw.start();
std::for_each(dataBundle.cbegin(), dataBundle.cend(),
    [&dataBundle, &maxes](const std::vector<size_t> &vec) {
        // The position of `vec` inside dataBundle is recovered from its address.
        const size_t slot = &vec - dataBundle.data();
        maxes[slot] = *std::max_element(vec.begin(), vec.end());
    });
std::cout << sw.stop() << std::endl;

// Same computation with std::__parallel::for_each, writing to maxes2.
std::vector<size_t> maxes2(lengthData, 0);
sw.start();
std::__parallel::for_each(dataBundle.cbegin(), dataBundle.cend(),
    [&dataBundle, &maxes2](const std::vector<size_t> &vec) {
        const size_t slot = &vec - dataBundle.data();
        maxes2[slot] = *std::max_element(vec.begin(), vec.end());
    });
std::cout << sw.stop() << std::endl;
Параллельная [стандартная] библиотека / на примере gcc
22
Результаты замены std на std::__parallel
std std::__parallel
max_element 0.015215 0.006585
sort 1.19395 0.264197
for_each 0.970016 0.356471
А если нужна большая гибкость? boost::thread
23
#include <stdlib.h>#include "boost/thread.hpp"#include <assert.h>
void someParallelTask() { while(true) { //forewer cycle boost::this_thread::disable_interruption di; //alloc some resources to work //do not interrupt me boost::this_thread::restore_interruption ri(di);
//ok check if I should die ( boost::this_thread::interruption_point(); }}
int main(int argc, char**argv) {
auto thread = boost::thread( someParallelTask );
//do some jod //And now I do not want to wait thread
thread.interrupt();
thread.join();
return 0;}
IMHO. Самая простая параллельность. OpenMP
24
// Sequential reference implementation: evaluates two values one after the
// other and combines them.
//   returns - the sum of both values (6).
int someExample() {
    int value1;
    // code to evaluate value1
    // may require significant amount of time
    value1 = 3;

    int value2;
    // code to evaluate value2
    // may require significant amount of time
    value2 = 3;

    // Now use some fancy algorithm using values
    int result = value1 + value2;
    assert(result == 6);

    return result;
}
// The OpenMP runtime header is only pulled in when the compiler runs with
// OpenMP enabled; without it the pragmas below are ignored and the two
// blocks simply run one after the other.
#ifdef _OPENMP
#include <omp.h>
#endif

// OpenMP variant of someExample: each value is evaluated in its own
// `section` of a `parallel sections` region.
//   returns - the sum of both values (6).
int someExampleParallel() {
    int value1;
    int value2;
#pragma omp parallel sections
    {
#pragma omp section
        {
            // code to evaluate value1
            // may require significant amount of time
            value1 = 3;
        }
#pragma omp section
        {
            // code to evaluate value2
            // may require significant amount of time
            value2 = 3;
        }
    } // here we wait for all blocks

    // Now use some fancy algorithm using values
    int result = value1 + value2;
    assert(result == 6);

    return result;
}
Итог
При достаточном уровне понимания сути алгоритма работы программы с использованием современного C++ легко
внедрять параллельную обработку данных. И этим надо пользоваться )
Я предпочитаю: std::async и std::list<std::future<T>>;
boost::thread (если нужен менеджер потоков); OpenMP
25