High-Performance Tensor Transposition (HPTT) C++ Library
A C++ library for high-performance multi-threaded tensor transpositions.
transpose.h
1#pragma once
2
3#include <list>
4#include <vector>
5#include <memory>
6#include <algorithm>
7
8#include <stdio.h>
9#ifdef _OPENMP
10#include <omp.h>
11#endif
12
13#include "hptt_types.h"
14
15namespace hptt {
16
17 class Plan;
18
36template<typename floatType>
38{
39
40 public:
41
42 /***************************************************
43 * Cons, Decons, Copy, ...
44 ***************************************************/
76 Transpose( const int *sizeA,
77 const int *perm,
78 const int *outerSizeA,
79 const int *outerSizeB,
80 const int dim,
81 const floatType *A,
82 const floatType alpha,
83 floatType *B,
84 const floatType beta,
85 const SelectionMethod selectionMethod,
86 const int numThreads,
87 const int *threadIds = nullptr,
88 const bool useRowMajor = false );
89
90 Transpose(const Transpose &other);
91
92 ~Transpose();
93
94 /***************************************************
95 * Getter & Setter
96 ***************************************************/
97 bool getConjA() noexcept { return conjA_; }
98 void setConjA(bool conjA) noexcept { conjA_ = conjA; }
99 int getNumThreads() const noexcept { return numThreads_; }
100 void setNumThreads(int numThreads) noexcept { numThreads_ = numThreads; }
101 floatType getAlpha() const noexcept { return alpha_; }
102 floatType getBeta() const noexcept { return beta_; }
106 void setAlpha(floatType alpha) noexcept { alpha_ = alpha; }
110 void setBeta(floatType beta) noexcept { beta_ = beta; }
117 void setInputPtr(const floatType *A) noexcept { A_ = A; }
124 void setOutputPtr(floatType *B) noexcept { B_ = B; }
128 const floatType* getInputPtr() const noexcept { return A_; }
132 floatType* getOutputPtr() const noexcept { return B_; }
133
138 void resetThreadIds() noexcept { threadIds_.clear(); }
139
144 void setMaxAutotuningCandidates (int num) { maxAutotuningCandidates_ = num; }
145
156 void addThreadId(int threadId) noexcept {
157#ifdef _OPENMP
158 omp_set_lock(&writelock);
159 threadIds_.push_back(threadId);
160 std::sort(threadIds_.begin(), threadIds_.end());
161 omp_unset_lock(&writelock);
162#endif
163 }
164
165 void printThreadIds() const noexcept { for( auto id : threadIds_) printf("%d, ",id); printf("\n"); }
166 int getMasterThreadId() const noexcept { return threadIds_[0]; }
167
168 /***************************************************
169 * Public Methods
170 ***************************************************/
175
196 template<bool useStreamingStores=true, bool spawnThreads=true, bool betaIsZero>
197 void execute_expert() noexcept;
198
203 void execute() noexcept;
204
205 void print() noexcept;
206
207 private:
208 /***************************************************
209 * Private Methods
210 ***************************************************/
211 void createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const;
212 std::shared_ptr<Plan> selectPlan( const std::vector<std::shared_ptr<Plan> > &plans );
213 void fuseIndices();
214 void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int dim);
215 void computeLeadingDimensions();
216 double loopCostHeuristic( const std::vector<int> &loopOrder ) const;
217 double parallelismCostHeuristic( const std::vector<int> &loopOrder ) const;
218 int getLocalThreadId(int myThreadId) const;
219 template<bool spawnThreads>
220 void getStartEnd(int n, int &myStart, int &myEnd) const;
221 void setParallelStrategy(int id) noexcept { selectedParallelStrategyId_ = id; }
222 void setLoopOrder(int id) noexcept { selectedLoopOrderId_ = id; }
223
224 /***************************************************
225 * Helper Methods
226 ***************************************************/
227 // parallelizes the loops by changing the value of parallelismStrategy
228 void parallelize( std::vector<int> &parallelismStrategy,
229 std::vector<int> &availableParallelismAtLoop,
230 int &totalTasks,
231 std::list<int> &primeFactors,
232 const float minBalancing,
233 const std::vector<int> &loopsAllowed) const;
234 float getLoadBalance( const std::vector<int> &parallelismStrategy ) const;
235 float estimateExecutionTime( const std::shared_ptr<Plan> plan); //execute just a few iterations and extrapolate the result
236 void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const;
237 void getBestParallelismStrategy ( std::vector<int> &bestParallelismStrategy ) const;
238 void getBestLoopOrder( std::vector<int> &loopOrder ) const; //innermost loop idx is stored at dim_-1
239 void getLoopOrders(std::vector<std::vector<int> > &loopOrders) const;
240 void getParallelismStrategies(std::vector<std::vector<int> > &parallelismStrategies) const;
241 void getAllParallelismStrategies( std::list<int> &primeFactorsToMatch,
242 std::vector<int> &availableParallelismAtLoop,
243 std::vector<int> &achievedParallelismAtLoop,
244 std::vector<std::vector<int> > &parallelismStrategies) const;
245 void getAvailableParallelism( std::vector<int> &numTasksPerLoop ) const;
246 int getIncrement( int loopIdx ) const;
247 void executeEstimate(const Plan *plan) noexcept; // almost identical to execute, but it just executes few iterations and then extrapolates
248 double getTimeLimit() const;
249
250 const floatType* __restrict__ A_;
251 floatType* __restrict__ B_;
252 floatType alpha_;
253 floatType beta_;
254 int dim_;
255 std::vector<size_t> sizeA_;
256 std::vector<int> perm_;
257 std::vector<size_t> outerSizeA_;
258 std::vector<size_t> outerSizeB_;
259 std::vector<size_t> lda_;
260 std::vector<size_t> ldb_;
261 std::vector<int> threadIds_;
262 int numThreads_;
263 int selectedParallelStrategyId_;
264 int selectedLoopOrderId_;
265 bool conjA_;
266#ifdef _OPENMP
267 omp_lock_t writelock;
268#endif
269
270 std::shared_ptr<Plan> masterPlan_;
271 SelectionMethod selectionMethod_;
272 int maxAutotuningCandidates_;
273 static constexpr int blocking_micro_ = REGISTER_BITS / 8 / sizeof(floatType);
274 static constexpr int blocking_ = blocking_micro_ * 4;
275
276 static constexpr int infoLevel_ = 0; // determines which auxiliary messages should be printed
277};
278
279
// Explicit instantiation declarations: translation units that include this
// header do not implicitly instantiate Transpose for these four element types;
// the matching explicit instantiation definitions must be provided elsewhere.
280extern template class Transpose<float>;
281extern template class Transpose<double>;
282extern template class Transpose<FloatComplex>;
283extern template class Transpose<DoubleComplex>;
284
285}
The Transpose class encodes all information related to the execution of the tensor transposition.
Definition: transpose.h:38
void setAlpha(floatType alpha) noexcept
Set the scaling factor for A.
Definition: transpose.h:106
void execute() noexcept
void setInputPtr(const floatType *A) noexcept
Set the pointer for A.
Definition: transpose.h:117
void setOutputPtr(floatType *B) noexcept
Set the pointer for B.
Definition: transpose.h:124
floatType * getOutputPtr() const noexcept
Get raw-data pointer to B.
Definition: transpose.h:132
void execute_expert() noexcept
Transpose(const int *sizeA, const int *perm, const int *outerSizeA, const int *outerSizeB, const int dim, const floatType *A, const floatType alpha, floatType *B, const floatType beta, const SelectionMethod selectionMethod, const int numThreads, const int *threadIds=nullptr, const bool useRowMajor=false)
void addThreadId(int threadId) noexcept
Definition: transpose.h:156
void setBeta(floatType beta) noexcept
Set the scaling factor for B.
Definition: transpose.h:110
void createPlan()
Creates the plan that encodes the execution of the tensor transposition.
const floatType * getInputPtr() const noexcept
Get raw-data pointer to A.
Definition: transpose.h:128
void setMaxAutotuningCandidates(int num)
Definition: transpose.h:144
void resetThreadIds() noexcept
Clears the array that stores the OpenMP threadIds. This function should only be used in conjunction with ...
Definition: transpose.h:138
Definition: compute_node.h:3
SelectionMethod
Determines the duration of the auto-tuning process.
Definition: hptt_types.h:22