13#include "hptt_types.h"
36template<
typename floatType>
78 const int *outerSizeA,
79 const int *outerSizeB,
82 const floatType alpha,
87 const int *threadIds =
nullptr,
88 const bool useRowMajor =
false );
97 bool getConjA() noexcept {
return conjA_; }
98 void setConjA(
bool conjA)
noexcept { conjA_ = conjA; }
99 int getNumThreads() const noexcept {
return numThreads_; }
100 void setNumThreads(
int numThreads)
noexcept { numThreads_ = numThreads; }
101 floatType getAlpha() const noexcept {
return alpha_; }
102 floatType getBeta() const noexcept {
return beta_; }
106 void setAlpha(floatType alpha)
noexcept { alpha_ = alpha; }
110 void setBeta(floatType beta)
noexcept { beta_ = beta; }
158 omp_set_lock(&writelock);
159 threadIds_.push_back(threadId);
160 std::sort(threadIds_.begin(), threadIds_.end());
161 omp_unset_lock(&writelock);
165 void printThreadIds() const noexcept {
for(
auto id : threadIds_) printf(
"%d, ",
id); printf(
"\n"); }
166 int getMasterThreadId() const noexcept {
return threadIds_[0]; }
196 template<
bool useStreamingStores=true,
bool spawnThreads=true,
bool betaIsZero>
/**
 * \brief Declaration only; the definition is not part of this header excerpt.
 *
 * NOTE(review): presumably prints a description of this object - confirm
 * against the definition.
 */
void print() noexcept;
211 void createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const;
212 std::shared_ptr<Plan> selectPlan( const std::vector<std::shared_ptr<Plan> > &plans );
/**
 * \brief Declaration only; defined outside this header excerpt.
 *
 * Takes four read-only int arrays and a dimension count.
 * NOTE(review): each array presumably holds \p dim entries - confirm against
 * the definition.
 */
void skipIndices(const int* _sizeA,
                 const int* _perm,
                 const int* _outerSizeA,
                 const int* _outerSizeB,
                 const int dim);
/**
 * \brief Declaration only; defined outside this header excerpt.
 *
 * NOTE(review): presumably fills lda_ and ldb_ - confirm against the definition.
 */
void computeLeadingDimensions();
216 double loopCostHeuristic( const std::vector<
int> &loopOrder ) const;
217 double parallelismCostHeuristic( const std::vector<
int> &loopOrder ) const;
218 int getLocalThreadId(
int myThreadId) const;
219 template<
bool spawnThreads>
220 void getStartEnd(
int n,
int &myStart,
int &myEnd) const;
221 void setParallelStrategy(
int id) noexcept { selectedParallelStrategyId_ = id; }
222 void setLoopOrder(
int id)
noexcept { selectedLoopOrderId_ = id; }
228 void parallelize( std::vector<int> ¶llelismStrategy,
229 std::vector<int> &availableParallelismAtLoop,
231 std::list<int> &primeFactors,
232 const float minBalancing,
233 const std::vector<int> &loopsAllowed)
const;
234 float getLoadBalance(
const std::vector<int> ¶llelismStrategy )
const;
235 float estimateExecutionTime(
const std::shared_ptr<Plan> plan);
236 void verifyParameter(
const int *size,
const int* perm,
const int* outerSizeA,
const int* outerSizeB,
const int dim)
const;
237 void getBestParallelismStrategy ( std::vector<int> &bestParallelismStrategy )
const;
238 void getBestLoopOrder( std::vector<int> &loopOrder )
const;
239 void getLoopOrders(std::vector<std::vector<int> > &loopOrders)
const;
240 void getParallelismStrategies(std::vector<std::vector<int> > ¶llelismStrategies)
const;
241 void getAllParallelismStrategies( std::list<int> &primeFactorsToMatch,
242 std::vector<int> &availableParallelismAtLoop,
243 std::vector<int> &achievedParallelismAtLoop,
244 std::vector<std::vector<int> > ¶llelismStrategies)
const;
245 void getAvailableParallelism( std::vector<int> &numTasksPerLoop )
const;
246 int getIncrement(
int loopIdx )
const;
247 void executeEstimate(
const Plan *plan)
noexcept;
248 double getTimeLimit()
const;
// Data members of the class. Declaration order is left untouched because it
// determines member initialisation order.

// Data pointers, both declared __restrict__; A_ is read-only, B_ is writable.
// NOTE(review): presumably the pointers exposed by getInputPtr()/getOutputPtr() - confirm.
250 const floatType* __restrict__ A_;
251 floatType* __restrict__ B_;
// Per-object copies of size/permutation data (names mirror the constructor
// arguments; how they are filled is not visible here).
255 std::vector<size_t> sizeA_;
256 std::vector<int> perm_;
257 std::vector<size_t> outerSizeA_;
258 std::vector<size_t> outerSizeB_;
// NOTE(review): presumably the leading dimensions of A and B, see
// computeLeadingDimensions() - confirm.
259 std::vector<size_t> lda_;
260 std::vector<size_t> ldb_;
// Thread ids; element 0 is what getMasterThreadId() returns.
261 std::vector<int> threadIds_;
// Written by setParallelStrategy() and setLoopOrder() respectively.
263 int selectedParallelStrategyId_;
264 int selectedLoopOrderId_;
// OpenMP lock taken around the push_back + sort on threadIds_.
267 omp_lock_t writelock;
270 std::shared_ptr<Plan> masterPlan_;
272 int maxAutotuningCandidates_;
// REGISTER_BITS / 8 / sizeof(floatType): element count per register, assuming
// REGISTER_BITS (defined elsewhere) is a width in bits - TODO confirm.
273 static constexpr int blocking_micro_ = REGISTER_BITS / 8 /
sizeof(floatType);
// Four times blocking_micro_.
274 static constexpr int blocking_ = blocking_micro_ * 4;
// Compile-time constant, fixed to 0 here.
276 static constexpr int infoLevel_ = 0;
280extern template class Transpose<float>;
281extern template class Transpose<double>;
282extern template class Transpose<FloatComplex>;
283extern template class Transpose<DoubleComplex>;
The Transpose class encodes all information related to the execution of the tensor transposition.
Definition: transpose.h:38
void setAlpha(floatType alpha) noexcept
set the scaling factor for A
Definition: transpose.h:106
void setInputPtr(const floatType *A) noexcept
Set the pointer for A.
Definition: transpose.h:117
void setOutputPtr(floatType *B) noexcept
Set the pointer for B.
Definition: transpose.h:124
floatType * getOutputPtr() const noexcept
Get raw-data pointer to B.
Definition: transpose.h:132
void execute_expert() noexcept
Transpose(const int *sizeA, const int *perm, const int *outerSizeA, const int *outerSizeB, const int dim, const floatType *A, const floatType alpha, floatType *B, const floatType beta, const SelectionMethod selectionMethod, const int numThreads, const int *threadIds=nullptr, const bool useRowMajor=false)
void addThreadId(int threadId) noexcept
Definition: transpose.h:156
void setBeta(floatType beta) noexcept
set the scaling factor for B
Definition: transpose.h:110
void createPlan()
Creates the plan that encodes the execution of the tensor transposition.
const floatType * getInputPtr() const noexcept
Get raw-data pointer to A.
Definition: transpose.h:128
void setMaxAutotuningCandidates(int num)
Definition: transpose.h:144
void resetThreadIds() noexcept
Clears the array that stores the OpenMP threadIds. This function should only be used in conjunction wi...
Definition: transpose.h:138
Definition: compute_node.h:3
SelectionMethod
Determines the duration of the auto-tuning process.
Definition: hptt_types.h:22