From cf7dcb816b5121cb9d654dc379a5a414746c53cd Mon Sep 17 00:00:00 2001
From: Christian Zimmermann
Date: Tue, 25 Oct 2022 23:45:05 +0200
Subject: [PATCH] WIP: avx

---
Review notes (text between the '---' marker and the diffstat is not part of
the commit):

 * This copy of the patch had lost its line breaks, its indentation, the
   author address and every '<identifier...>' span (template parameter and
   argument lists, reinterpret_cast target types, the system header name in
   avx.h). Line breaks were rebuilt from the +/- markers and the hunk line
   counts. The angle-bracket contents were reconstructed from the surrounding
   code and are assumptions -- verify them against the repository before
   applying. Context-line indentation is not original, so applying will
   presumably need whitespace-insensitive matching.
 * Fixed: avx.cc.h referred to AVX_SIZE, but avx.h only defines AVX_VSIZE.
 * Fixed: 'return AVX::ConsecutiveD { _mm256_setr_pd(args...); }' (and the
   ConsecutiveI twin) had the ';' inside the braces.
 * Fixed: the namespace-scope compound assignment operators (+=, -=, *=, /=)
   were declared with a single parameter; a non-member binary operator needs
   the left operand as well. It is added as first parameter 'o' and the
   result is returned by reference.
 * Fixed: banner typo 'muliplies'.
 * Open question: MkConsecutive::make() dereferences the data pointer through
   reinterpret_cast; for the AVX specializations this presumably requires the
   pointer to be 32-byte aligned -- confirm, or switch to unaligned loads.

 src/include/operation/extensions/avx.cc.h     |  48 ++++++++
 src/include/operation/extensions/avx.h        |  51 +++++++++
 .../operation/extensions/extensions.cc.h      |  11 ++
 src/include/operation/extensions/extensions.h |  13 +++
 src/include/operation/extensions/reg.cc.h     |  35 ++++--
 src/include/operation/extensions/reg.h        | 104 ++++++++++++++++--
 src/include/operation/op_types.cc.h           |   1 +
 src/include/xpr/pos_type.cc.h                 |   4 +-
 8 files changed, 249 insertions(+), 18 deletions(-)
 create mode 100644 src/include/operation/extensions/avx.cc.h
 create mode 100644 src/include/operation/extensions/avx.h
 create mode 100644 src/include/operation/extensions/extensions.cc.h
 create mode 100644 src/include/operation/extensions/extensions.h

diff --git a/src/include/operation/extensions/avx.cc.h b/src/include/operation/extensions/avx.cc.h
new file mode 100644
index 0000000..298769a
--- /dev/null
+++ b/src/include/operation/extensions/avx.cc.h
@@ -0,0 +1,48 @@
+
+#ifndef __cxz_avx_cc_h__
+#define __cxz_avx_cc_h__
+
+#include "avx.h"
+
+namespace CNORXZ
+{
+    inline decltype(auto) MkConsecutive<Double,AVX_VSIZE/sizeof(Double)>::make(const Double* d)
+    {
+        return *reinterpret_cast<const AVX::ConsecutiveD*>( d );
+    }
+
+    inline decltype(auto) MkConsecutive<Double,AVX_VSIZE/sizeof(Double)>::make(Double* d)
+    {
+        return *reinterpret_cast<AVX::ConsecutiveD*>( d );
+    }
+
+    template <typename... Args>
+    inline decltype(auto) MkConsecutive<Double,AVX_VSIZE/sizeof(Double)>::makeA(Args&&... args)
+    {
+        static_assert(sizeof...(Args) == AVX_VSIZE/sizeof(Double),
+                      "got inconsistent number of arguments");
+        return AVX::ConsecutiveD { _mm256_setr_pd(args...) };
+    }
+
+    inline decltype(auto) MkConsecutive<Int,AVX_VSIZE/sizeof(Int)>::make(const Int* d)
+    {
+        return *reinterpret_cast<const AVX::ConsecutiveI*>( d );
+    }
+
+    inline decltype(auto) MkConsecutive<Int,AVX_VSIZE/sizeof(Int)>::make(Int* d)
+    {
+        return *reinterpret_cast<AVX::ConsecutiveI*>( d );
+    }
+
+    template <typename... Args>
+    inline decltype(auto) MkConsecutive<Int,AVX_VSIZE/sizeof(Int)>::makeA(Args&&... args)
+    {
+        static_assert(sizeof(Int) == 32/8, "lib error: Int size has changed");
+        static_assert(sizeof...(Args) == AVX_VSIZE/sizeof(Int),
+                      "got inconsistent number of arguments");
+        return AVX::ConsecutiveI { _mm256_setr_epi32(args...) };
+    }
+
+}
+
+#endif
diff --git a/src/include/operation/extensions/avx.h b/src/include/operation/extensions/avx.h
new file mode 100644
index 0000000..c03dfa2
--- /dev/null
+++ b/src/include/operation/extensions/avx.h
@@ -0,0 +1,51 @@
+
+#ifndef __cxz_avx_h__
+#define __cxz_avx_h__
+
+#include <immintrin.h>
+
+#include "base/base.h"
+
+#define AVX_VSIZE (256/8)
+
+namespace CNORXZ
+{
+    namespace AVX
+    {
+        // define for all types that are defined in base/types.h
+        struct ConsecutiveD
+        {
+            __m256d mD;
+        };
+
+        struct ConsecutiveI
+        {
+            __m256i mD;
+        };
+    }
+
+    template <>
+    struct MkConsecutive<Double,AVX_VSIZE/sizeof(Double)>
+    {
+        static inline decltype(auto) make(const Double* d);
+
+        static inline decltype(auto) make(Double* d);
+
+        template <typename... Args>
+        static inline decltype(auto) makeA(Args&&... args);
+    };
+
+    template <>
+    struct MkConsecutive<Int,AVX_VSIZE/sizeof(Int)>
+    {
+        static inline decltype(auto) make(const Int* d);
+
+        static inline decltype(auto) make(Int* d);
+
+        template <typename... Args>
+        static inline decltype(auto) makeA(Args&&... args);
+    };
+
+}
+
+#endif
diff --git a/src/include/operation/extensions/extensions.cc.h b/src/include/operation/extensions/extensions.cc.h
new file mode 100644
index 0000000..1bdc7d9
--- /dev/null
+++ b/src/include/operation/extensions/extensions.cc.h
@@ -0,0 +1,11 @@
+
+#ifndef __cxz_extensions_cc_h__
+#define __cxz_extensions_cc_h__
+
+#include "reg.cc.h"
+
+#if CXZ_HAVE_AVX
+#include "avx.cc.h"
+#endif
+
+#endif
diff --git a/src/include/operation/extensions/extensions.h b/src/include/operation/extensions/extensions.h
new file mode 100644
index 0000000..8a52a6b
--- /dev/null
+++ b/src/include/operation/extensions/extensions.h
@@ -0,0 +1,13 @@
+
+#ifndef __cxz_extensions_h__
+#define __cxz_extensions_h__
+
+#include "reg.h"
+
+#if CXZ_HAVE_AVX
+#include "avx.h"
+#endif
+
+#include "extensions.cc.h"
+
+#endif
diff --git a/src/include/operation/extensions/reg.cc.h b/src/include/operation/extensions/reg.cc.h
index fc240c8..634a2c7 100644
--- a/src/include/operation/extensions/reg.cc.h
+++ b/src/include/operation/extensions/reg.cc.h
@@ -7,22 +7,40 @@
 
 namespace CNORXZ
 {
+    template <typename T, SizeT N>
+    inline decltype(auto) MkConsecutive<T,N>::make(const T* d)
+    {
+        return *reinterpret_cast<const Consecutive<T,N>*>(d);
+    }
+
+    template <typename T, SizeT N>
+    inline decltype(auto) MkConsecutive<T,N>::make(T* d)
+    {
+        return *reinterpret_cast<Consecutive<T,N>*>(d);
+    }
+
+    template <typename T, SizeT N>
+    template <typename... Args>
+    inline decltype(auto) MkConsecutive<T,N>::makeA(Args&&... args)
+    {
+        return Consecutive<T,N> { args... };
+    }
+
     template <typename T, class EPosT, SizeT... Is>
-    decltype(auto) vregi(const T* d, const EPosT& pos, std::index_sequence<Is...> is)
+    inline decltype(auto) vregi(const T* d, const EPosT& pos, std::index_sequence<Is...> is)
     {
         constexpr SizeT N = epos_size<EPosT>::value;
         static_assert(N == sizeof...(Is), "got inconsistent index sequence");
-        return PseudoReg<T,N> { d[pos.val()+pos.template get<Is>().val()]... };
+        return MkConsecutive<T,N>::makeA( d[pos.val()+pos.template get<Is>().val()]... );
     }
-
+
     template <typename T, class EPosT>
-    decltype(auto) vreg(const T* d, const EPosT& pos)
+    inline decltype(auto) vreg(const T* d, const EPosT& pos)
     {
         constexpr SizeT N = epos_size<EPosT>::value;
         static_assert(is_epos_type<EPosT>::value, "got non-epos-type");
         if constexpr(pos_type_is_consecutive<EPosT>::value){
-            return *reinterpret_cast<const PseudoReg<T,N>*>
-                (d+pos.val()+pos.template get<0>().val());
+            return MkConsecutive<T,N>::make(d+pos.val()+pos.template get<0>().val());
         }
         else {
             return vregi(d, pos, std::make_index_sequence<N>{});
@@ -30,13 +48,12 @@ namespace CNORXZ
     }
 
     template <typename T, class EPosT>
-    decltype(auto) vreg(T* d, const EPosT& pos)
+    inline decltype(auto) vreg(T* d, const EPosT& pos)
     {
         constexpr SizeT N = epos_size<EPosT>::value;
         static_assert(is_epos_type<EPosT>::value, "got non-epos-type");
         if constexpr(pos_type_is_consecutive<EPosT>::value){
-            return *reinterpret_cast<PseudoReg<T,N>*>
-                (d+pos.val()+pos.template get<0>().val());
+            return MkConsecutive<T,N>::make(d+pos.val()+pos.template get<0>().val());
         }
         else {
             return vregi(d, pos, std::make_index_sequence<N>{});
diff --git a/src/include/operation/extensions/reg.h b/src/include/operation/extensions/reg.h
index 8a84f71..f1c2d26 100644
--- a/src/include/operation/extensions/reg.h
+++ b/src/include/operation/extensions/reg.h
@@ -11,23 +11,113 @@ namespace CNORXZ
     // no use of Arr = std::array here, since I want ensure that
     // it has exactly a memory size of N
     template <typename T, SizeT N>
-    struct PseudoReg
+    struct Consecutive
     {
         T mD[N];
     };
 
+    // specialize for all kinds of available vector registers:
+    template <typename T, SizeT N>
+    struct MkConsecutive
+    {
+        static inline decltype(auto) make(const T* d);
+
+        static inline decltype(auto) make(T* d);
+
+        template <typename... Args>
+        static inline decltype(auto) makeA(Args&&... args);
+    };
+
+
+    /****************************************
+     *   consecutive generating functions   *
+     ****************************************/
+
     template <typename T, class EPosT, SizeT... Is>
-    decltype(auto) vregi(const T* d, const EPosT& pos, std::index_sequence<Is...> is);
+    inline decltype(auto) vregi(const T* d, const EPosT& pos, std::index_sequence<Is...> is);
 
-    // specialize for all kinds of available vector registers:
     template <typename T, class EPosT>
-    decltype(auto) vreg(const T* d, const EPosT& pos);
+    inline decltype(auto) vreg(const T* d, const EPosT& pos);
 
-    // specialize for all kinds of available vector registers:
     template <typename T, class EPosT>
-    decltype(auto) vreg(T* d, const EPosT& pos);
+    inline decltype(auto) vreg(T* d, const EPosT& pos);
+
+    /******************************
+     *   basic operations: plus   *
+     ******************************/
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator+(const Consecutive<T,N>& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator+(const Consecutive<T,N>& a, const T& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator+(const T& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator+=(Consecutive<T,N>& o, const Consecutive<T,N>& a);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator+=(Consecutive<T,N>& o, const T& a);
+
+    /*******************************
+     *   basic operations: minus   *
+     *******************************/
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator-(const Consecutive<T,N>& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator-(const Consecutive<T,N>& a, const T& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator-(const T& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator-=(Consecutive<T,N>& o, const Consecutive<T,N>& a);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator-=(Consecutive<T,N>& o, const T& a);
+
+    /************************************
+     *   basic operations: multiplies   *
+     ************************************/
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator*(const Consecutive<T,N>& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator*(const Consecutive<T,N>& a, const T& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator*(const T& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator*=(Consecutive<T,N>& o, const Consecutive<T,N>& a);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator*=(Consecutive<T,N>& o, const T& a);
+
+    /*********************************
+     *   basic operations: divides   *
+     *********************************/
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator/(const Consecutive<T,N>& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator/(const Consecutive<T,N>& a, const T& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N> operator/(const T& a, const Consecutive<T,N>& b);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator/=(Consecutive<T,N>& o, const Consecutive<T,N>& a);
+
+    template <typename T, SizeT N>
+    constexpr Consecutive<T,N>& operator/=(Consecutive<T,N>& o, const T& a);
 
-    // TODO: Maybe specialize PseudoReg (-> Reg) itself (?)
 }
 
 #endif
diff --git a/src/include/operation/op_types.cc.h b/src/include/operation/op_types.cc.h
index 45fd3b2..d50239d 100644
--- a/src/include/operation/op_types.cc.h
+++ b/src/include/operation/op_types.cc.h
@@ -6,6 +6,7 @@
 #include "xpr/pos_type.h"
 #include "xpr/op_xpr.h"
 #include "op_utility.h"
+#include "extensions/extensions.h"
 
 namespace CNORXZ
 {
diff --git a/src/include/xpr/pos_type.cc.h b/src/include/xpr/pos_type.cc.h
index e472ded..c635180 100644
--- a/src/include/xpr/pos_type.cc.h
+++ b/src/include/xpr/pos_type.cc.h
@@ -575,10 +575,10 @@ namespace CNORXZ
     {
         if constexpr(is_static_pos_type<OPosT1>::value and is_static_pos_type<OPosT2>::value){
             if constexpr(sizeof...(OPosTs) != 0){
-                return OPosT1().val() < OPosT2().val() and pos_types_consecutive<OPosT2,OPosTs...>::value;
+                return OPosT1().val()+1 == OPosT2().val() and pos_types_consecutive<OPosT2,OPosTs...>::value;
             }
             else {
-                return OPosT1().val() < OPosT2().val();
+                return OPosT1().val()+1 == OPosT2().val();
             }
         }
         return false;