diff --git a/src/include/helper_tools.h b/src/include/helper_tools.h index aa35dd1..64bfcf7 100644 --- a/src/include/helper_tools.h +++ b/src/include/helper_tools.h @@ -68,6 +68,13 @@ namespace MultiArrayTools auto rptr(const MArray& ma) -> decltype(ma.template getRangePtr()); + template + auto get(const std::shared_ptr& i) + -> decltype(i->template getPtr()) + { + return i->template getPtr(); + } + template auto dynamic(const MArray& ma, bool slice = false) -> std::shared_ptr>>; @@ -96,6 +103,24 @@ namespace MultiArrayTools } } + // parallel: + template + inline void PFor(const std::shared_ptr& ind, + const std::function&)>& ll) + { + const int max = static_cast(ind->max()); + int i = 0; +#pragma omp parallel shared(ind,ll) private(i) + { +#pragma omp for nowait + for(i = 0; i < max; i++) { + auto ii = getIndex( ind->range() ); + ((*ii) = i)(); + ll(ii); + } + } + } + template inline auto mkOp(const std::shared_ptr& i) diff --git a/src/include/multi_array.cc.h b/src/include/multi_array.cc.h index 0549101..ecbf9b1 100644 --- a/src/include/multi_array.cc.h +++ b/src/include/multi_array.cc.h @@ -164,6 +164,20 @@ namespace MultiArrayTools return MultiArray( nrs , std::move(mCont) ); } + template + template + Slice MultiArray::slformat(const std::shared_ptr&... nrs) + { + return Slice( nrs..., mCont.data() ); + } + + template + template + ConstSlice MultiArray::slformat(const std::shared_ptr&... nrs) const + { + return ConstSlice( nrs..., mCont.data() ); + } + template const T* MultiArray::data() const { diff --git a/src/include/multi_array.h b/src/include/multi_array.h index aaa54ff..cfab630 100644 --- a/src/include/multi_array.h +++ b/src/include/multi_array.h @@ -72,7 +72,13 @@ namespace MultiArrayTools template MultiArray format(const std::tuple...>& nrs); - + + template + Slice slformat(const std::shared_ptr&... nrs); + + template + ConstSlice slformat(const std::shared_ptr&... nrs) const; + virtual const T* data() const override; virtual T* data() override; virtual vector& vdata() { return mCont; } diff --git a/src/include/type_operations.h b/src/include/type_operations.h index d40d68e..cb6903b 100644 --- a/src/include/type_operations.h +++ b/src/include/type_operations.h @@ -168,6 +168,24 @@ namespace MultiArrayTools } } + template + inline void x1add(double* o, const double* a, const double& b) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] + b; + } + } + + template + inline void x2add(double* o, const double& a, const double* b) + { +#pragma omp simd aligned(o, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a + b[i]; + } + } + template inline void xsadd(double* o, const double* a) { @@ -177,6 +195,150 @@ namespace MultiArrayTools } } + template + inline void xradd(double& o, const double* a) + { +#pragma omp simd reduction(+: o) aligned(a: 32) + for(int i = 0; i < N; i++) { + o += a[i]; + } + } + + template + inline void xsub(double* o, const double* a, const double* b) + { +#pragma omp simd aligned(o, a, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] - b[i]; + } + } + + template + inline void xssub(double* o, const double* a) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] -= a[i]; + } + } + + template + inline void xrsub(double& o, const double* a) + { +#pragma omp simd reduction(-: o) aligned(a: 32) + for(int i = 0; i < N; i++) { + o -= a[i]; + } + } + + template + inline void x1sub(double* o, const double* a, const double& b) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] - b; + } + } + + template + inline void x2sub(double* o, const double& a, const double* b) + { +#pragma omp simd aligned(o, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a - b[i]; + } + } + + template + inline void xmul(double* o, const double* a, const double* b) + { +#pragma omp simd aligned(o, a, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] * b[i]; + } + } + + template + inline void xsmul(double* o, const double* a) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] *= a[i]; + } + } + + template + inline void xrmul(double& o, const double* a) + { +#pragma omp simd reduction(*: o) aligned(a: 32) + for(int i = 0; i < N; i++) { + o *= a[i]; + } + } + + template + inline void x1mul(double* o, const double* a, const double& b) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] * b; + } + } + + template + inline void x2mul(double* o, const double& a, const double* b) + { +#pragma omp simd aligned(o, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a * b[i]; + } + } + + template + inline void xdiv(double* o, const double* a, const double* b) + { +#pragma omp simd aligned(o, a, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] / b[i]; + } + } + + template + inline void xsdiv(double* o, const double* a) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] /= a[i]; + } + } + /* + template + inline void xrdiv(double& o, const double* a) + { +#pragma omp simd reduction(/: o) aligned(a: 32) + for(int i = 0; i < N; i++) { + o /= a[i]; + } + } + */ + template + inline void x1div(double* o, const double* a, const double& b) + { +#pragma omp simd aligned(o, a: 32) + for(int i = 0; i < N; i++) { + o[i] = a[i] / b; + } + } + + template + inline void x2div(double* o, const double& a, const double* b) + { +#pragma omp simd aligned(o, b: 32) + for(int i = 0; i < N; i++) { + o[i] = a / b[i]; + } + } + inline v256 operator+(const v256& a, const v256& b) { v256 o; @@ -184,45 +346,131 @@ namespace MultiArrayTools return o; } + inline v256 operator+(const v256& a, const double& b) + { + v256 o; + x1add<4>( o._x, a._x, b ); + return o; + } + + inline v256 operator+(const double& a, const v256& b) + { + v256 o; + x2add<4>( o._x, a, b._x ); + return o; + } + + inline double& operator+=(double& o, const v256& a) + { + xradd<4>( o, a._x ); + return o; + } + inline v256& operator+=(v256& o, const v256& a) { - //xsadd<4>( reinterpret_cast(&o), reinterpret_cast(&a) ); xsadd<4>( o._x, a._x ); return o; } -/* + inline v256 operator-(const v256& a, const v256& b) { - v256 out; -#pragma omp simd aligned(outp, ap, bp: 32) - for(int i = 0; i < IN; ++i){ - outp[i] = ap[i] - bp[i]; - } - return out; + v256 o; + xsub<4>( o._x, a._x, b._x ); + return o; + } + + inline v256 operator-(const v256& a, const double& b) + { + v256 o; + x1sub<4>( o._x, a._x, b ); + return o; + } + + inline v256 operator-(const double& a, const v256& b) + { + v256 o; + x2sub<4>( o._x, a, b._x ); + return o; + } + + inline double& operator-=(double& o, const v256& a) + { + xrsub<4>( o, a._x ); + return o; + } + + inline v256& operator-=(v256& o, const v256& a) + { + xssub<4>( o._x, a._x ); + return o; } inline v256 operator*(const v256& a, const v256& b) { - v256 out; -#pragma omp simd aligned(outp, ap, bp: 32) - for(int i = 0; i < IN; ++i){ - outp[i] = ap[i] * bp[i]; - } - return out; + v256 o; + xmul<4>( o._x, a._x, b._x ); + return o; } - + + inline v256& operator*=(v256& o, const v256& a) + { + xsmul<4>( o._x, a._x ); + return o; + } + + inline v256 operator*(const v256& a, const double& b) + { + v256 o; + x1mul<4>( o._x, a._x, b ); + return o; + } + + inline v256 operator*(const double& a, const v256& b) + { + v256 o; + x2mul<4>( o._x, a, b._x ); + return o; + } + + inline double& operator*=(double& o, const v256& a) + { + xrmul<4>( o, a._x ); + return o; + } + inline v256 operator/(const v256& a, const v256& b) { - v256 out; -#pragma omp simd aligned(outp, ap, bp: 32) - for(int i = 0; i < IN; ++i){ - outp[i] = ap[i] / bp[i]; - } - return out; + v256 o; + xdiv<4>( o._x, a._x, b._x ); + return o; + } + + inline v256 operator/(const v256& a, const double& b) + { + v256 o; + x1div<4>( o._x, a._x, b ); + return o; + } + + inline v256 operator/(const double& a, const v256& b) + { + v256 o; + x2div<4>( o._x, a, b._x ); + return o; + } + /* + inline double& operator/=(double& o, const v256& a) + { + xrdiv<4>( o, a._x ); + return o; } */ - - + inline v256& operator/=(v256& o, const v256& a) + { + xsdiv<4>( o._x, a._x ); + return o; + } + } // namespace MultiArrayTools #endif diff --git a/src/include/xfor/xfor.h b/src/include/xfor/xfor.h index 7af5458..6768755 100644 --- a/src/include/xfor/xfor.h +++ b/src/include/xfor/xfor.h @@ -499,7 +499,7 @@ namespace MultiArrayHelper size_t mnpos = 0; ExtType npos; auto expr = mExpr; -#pragma omp parallel shared(expr,mnpos,npos) private(pos) +#pragma omp parallel shared(expr) private(pos,mnpos,npos) { #pragma omp for nowait for(pos = 0; pos < static_cast(ForBound::template bound(mMax)); pos++){ @@ -520,7 +520,7 @@ namespace MultiArrayHelper size_t mnpos = 0; ExtType npos; auto expr = mExpr; -#pragma omp parallel shared(expr,mnpos,npos) private(pos) +#pragma omp parallel shared(expr) private(pos,mnpos,npos) { #pragma omp for nowait for(pos = 0; pos < static_cast(ForBound::template bound(mMax)); pos++){