add some utilities + more simd arithmetics

2019-03-01 18:13:51 +01:00 · 2019-03-01 18:13:51 +01:00 · 636e06bd5b
commit 636e06bd5b
parent c75c585ce6
5 changed files with 319 additions and 26 deletions
--- a/src/include/helper_tools.h
+++ b/src/include/helper_tools.h
@ -68,6 +68,13 @@ namespace MultiArrayTools
    auto rptr(const MArray& ma)
 	-> decltype(ma.template getRangePtr<N>());

+    template <size_t I, class MIndex>
+    auto get(const std::shared_ptr<MIndex>& i)
+        -> decltype(i->template getPtr<I>())
+    {
+        return i->template getPtr<I>();
+    }
+    
    template <class EC, class MArray>
    auto dynamic(const MArray& ma, bool slice = false)
 	-> std::shared_ptr<MultiArrayBase<typename MArray::value_type,DynamicRange<EC>>>;
@ -96,6 +103,24 @@ namespace MultiArrayTools
 	}
    }

+    // parallel:
+    template <class IndexType>
+    inline void PFor(const std::shared_ptr<IndexType>& ind,
+                    const std::function<void(const std::shared_ptr<IndexType>&)>& ll)
+    {
+        const int max = static_cast<int>(ind->max());
+        int i = 0;
+#pragma omp parallel shared(ind,ll) private(i)
+        {
+#pragma omp for nowait
+            for(i = 0; i < max; i++) {
+                auto ii = getIndex( ind->range() );
+                ((*ii) = i)();
+                ll(ii);
+            }
+        }
+    }
+

    template <class Index>
    inline auto mkOp(const std::shared_ptr<Index>& i)
--- a/src/include/multi_array.cc.h
+++ b/src/include/multi_array.cc.h
@ -164,6 +164,20 @@ namespace MultiArrayTools
 	return MultiArray<T,SRanges2...>( nrs , std::move(mCont) );
    }

+    template <typename T, class... SRanges>
+    template <class... SRanges2>
+    Slice<T,SRanges2...> MultiArray<T,SRanges...>::slformat(const std::shared_ptr<SRanges2>&... nrs)
+    {
+        return Slice<T,SRanges2...>( nrs..., mCont.data() );
+    }
+
+    template <typename T, class... SRanges>
+    template <class... SRanges2>
+    ConstSlice<T,SRanges2...> MultiArray<T,SRanges...>::slformat(const std::shared_ptr<SRanges2>&... nrs) const
+    {
+        return ConstSlice<T,SRanges2...>( nrs..., mCont.data() );
+    }
+    
    template <typename T, class... SRanges>
    const T* MultiArray<T,SRanges...>::data() const
    {
--- a/src/include/multi_array.h
+++ b/src/include/multi_array.h
@ -73,6 +73,12 @@ namespace MultiArrayTools
 	template <class... SRanges2>
 	MultiArray<T,SRanges2...> format(const std::tuple<std::shared_ptr<SRanges2>...>& nrs);

+        template <class... SRanges2>
+        Slice<T,SRanges2...> slformat(const std::shared_ptr<SRanges2>&... nrs);
+
+        template <class... SRanges2>
+        ConstSlice<T,SRanges2...> slformat(const std::shared_ptr<SRanges2>&... nrs) const;
+
 	virtual const T* data() const override;
 	virtual T* data() override;
 	virtual vector<T>& vdata() { return mCont; }
--- a/src/include/type_operations.h
+++ b/src/include/type_operations.h
@ -168,6 +168,24 @@ namespace MultiArrayTools
 	}  
    }

+    template <int N>
+    inline void x1add(double* o, const double* a, const double& b)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] + b;
+	}  
+    }
+
+    template <int N>
+    inline void x2add(double* o, const double& a, const double* b)
+    {
+#pragma omp simd aligned(o, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a + b[i];
+	}  
+    }
+
    template <int N>
    inline void xsadd(double* o, const double* a)
    {
@ -177,6 +195,150 @@ namespace MultiArrayTools
 	}  
    }

+    template <int N>
+    inline void xradd(double& o, const double* a)
+    {
+#pragma omp simd reduction(+: o) aligned(a: 32)
+	for(int i = 0; i < N; i++) {
+	    o += a[i];
+	}
+    }
+
+    template <int N>
+    inline void xsub(double* o, const double* a, const double* b)
+    {
+#pragma omp simd aligned(o, a, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] - b[i];
+	}  
+    }
+
+    template <int N>
+    inline void xssub(double* o, const double* a)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] -= a[i];
+	}  
+    }
+
+    template <int N>
+    inline void xrsub(double& o, const double* a)
+    {
+#pragma omp simd reduction(-: o) aligned(a: 32)
+	for(int i = 0; i < N; i++) {
+	    o -= a[i];
+	}
+    }
+
+    template <int N>
+    inline void x1sub(double* o, const double* a, const double& b)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] - b;
+	}  
+    }
+
+    template <int N>
+    inline void x2sub(double* o, const double& a, const double* b)
+    {
+#pragma omp simd aligned(o, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a - b[i];
+	}  
+    }
+
+    template <int N>
+    inline void xmul(double* o, const double* a, const double* b)
+    {
+#pragma omp simd aligned(o, a, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] * b[i];
+	}  
+    }
+
+    template <int N>
+    inline void xsmul(double* o, const double* a)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] *= a[i];
+	}  
+    }
+
+    template <int N>
+    inline void xrmul(double& o, const double* a)
+    {
+#pragma omp simd reduction(*: o) aligned(a: 32)
+	for(int i = 0; i < N; i++) {
+	    o *= a[i];
+	}
+    }
+
+    template <int N>
+    inline void x1mul(double* o, const double* a, const double& b)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] * b;
+	}  
+    }
+
+    template <int N>
+    inline void x2mul(double* o, const double& a, const double* b)
+    {
+#pragma omp simd aligned(o, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a * b[i];
+	}  
+    }
+
+    template <int N>
+    inline void xdiv(double* o, const double* a, const double* b)
+    {
+#pragma omp simd aligned(o, a, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] / b[i];
+	}  
+    }
+
+    template <int N>
+    inline void xsdiv(double* o, const double* a)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] /= a[i];
+	}  
+    }
+    /*
+    template <int N>
+    inline void xrdiv(double& o, const double* a)
+    {
+#pragma omp simd reduction(/: o) aligned(a: 32)
+	for(int i = 0; i < N; i++) {
+	    o /= a[i];
+	}
+    }
+    */
+    template <int N>
+    inline void x1div(double* o, const double* a, const double& b)
+    {
+#pragma omp simd aligned(o, a: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a[i] / b;
+	}  
+    }
+
+    template <int N>
+    inline void x2div(double* o, const double& a, const double* b)
+    {
+#pragma omp simd aligned(o, b: 32)
+	for(int i = 0; i < N; i++) {
+	    o[i] = a / b[i];
+	}  
+    }
+
    inline v256 operator+(const v256& a, const v256& b)
    {
 	v256 o;
@ -184,44 +346,130 @@ namespace MultiArrayTools
 	return o;
    }

+    inline v256 operator+(const v256& a, const double& b)
+    {
+	v256 o;
+	x1add<4>( o._x, a._x, b );
+	return o;
+    }
+
+    inline v256 operator+(const double& a, const v256& b)
+    {
+	v256 o;
+	x2add<4>( o._x, a, b._x );
+	return o;
+    }
+
+    inline double& operator+=(double& o, const v256& a)
+    {
+        xradd<4>( o, a._x );
+	return o;
+    }
+
    inline v256& operator+=(v256& o, const v256& a)
    {
-	//xsadd<4>( reinterpret_cast<double*>(&o), reinterpret_cast<const double*>(&a) );
        xsadd<4>( o._x, a._x );
 	return o;
    }
-/*
+
    inline v256 operator-(const v256& a, const v256& b)
    {
-        	v256 out;
-#pragma omp simd aligned(outp, ap, bp: 32)
-	for(int i = 0; i < IN; ++i){
-	    outp[i] = ap[i] - bp[i];
-	}
-	return out;
+	v256 o;
+	xsub<4>( o._x, a._x, b._x );
+	return o;
+    }
+
+    inline v256 operator-(const v256& a, const double& b)
+    {
+	v256 o;
+	x1sub<4>( o._x, a._x, b );
+	return o;
+    }
+
+    inline v256 operator-(const double& a, const v256& b)
+    {
+	v256 o;
+	x2sub<4>( o._x, a, b._x );
+	return o;
+    }
+
+    inline double& operator-=(double& o, const v256& a)
+    {
+        xrsub<4>( o, a._x );
+	return o;
+    }
+
+    inline v256& operator-=(v256& o, const v256& a)
+    {
+        xssub<4>( o._x, a._x );
+	return o;
    }

    inline v256 operator*(const v256& a, const v256& b)
    {
-	v256 out;
-#pragma omp simd aligned(outp, ap, bp: 32)
-	for(int i = 0; i < IN; ++i){
-	    outp[i] = ap[i] * bp[i];
-	}
-	return out;
+	v256 o;
+	xmul<4>( o._x, a._x, b._x );
+	return o;
+    }
+
+    inline v256& operator*=(v256& o, const v256& a)
+    {
+        xsmul<4>( o._x, a._x );
+	return o;
+    }
+
+    inline v256 operator*(const v256& a, const double& b)
+    {
+	v256 o;
+	x1mul<4>( o._x, a._x, b );
+	return o;
+    }
+
+    inline v256 operator*(const double& a, const v256& b)
+    {
+	v256 o;
+	x2mul<4>( o._x, a, b._x );
+	return o;
+    }
+
+    inline double& operator*=(double& o, const v256& a)
+    {
+        xrmul<4>( o, a._x );
+	return o;
    }

    inline v256 operator/(const v256& a, const v256& b)
    {
-	v256 out;
-#pragma omp simd aligned(outp, ap, bp: 32)
-	for(int i = 0; i < IN; ++i){
-	    outp[i] = ap[i] / bp[i];
-	}
-	return out;
+	v256 o;
+	xdiv<4>( o._x, a._x, b._x );
+	return o;
+    }
+
+    inline v256 operator/(const v256& a, const double& b)
+    {
+	v256 o;
+	x1div<4>( o._x, a._x, b );
+	return o;
+    }
+
+    inline v256 operator/(const double& a, const v256& b)
+    {
+	v256 o;
+	x2div<4>( o._x, a, b._x );
+	return o;
+    }
+    /*
+    inline double& operator/=(double& o, const v256& a)
+    {
+        xrdiv<4>( o, a._x );
+	return o;
    }
    */
-    
+    inline v256& operator/=(v256& o, const v256& a)
+    {
+        xsdiv<4>( o._x, a._x );
+	return o;
+    }
   
 } // namespace MultiArrayTools

--- a/src/include/xfor/xfor.h
+++ b/src/include/xfor/xfor.h
@ -499,7 +499,7 @@ namespace MultiArrayHelper
        size_t mnpos = 0;
        ExtType npos;
        auto expr = mExpr;
-#pragma omp parallel shared(expr,mnpos,npos) private(pos)
+#pragma omp parallel shared(expr) private(pos,mnpos,npos)
        {
 #pragma omp for nowait
            for(pos = 0; pos < static_cast<int>(ForBound<RangeType::ISSTATIC>::template bound<RangeType::SIZE>(mMax)); pos++){
@ -520,7 +520,7 @@ namespace MultiArrayHelper
        size_t mnpos = 0;
        ExtType npos;
        auto expr = mExpr;
-#pragma omp parallel shared(expr,mnpos,npos) private(pos)
+#pragma omp parallel shared(expr) private(pos,mnpos,npos)
        {
 #pragma omp for nowait
            for(pos = 0; pos < static_cast<int>(ForBound<RangeType::ISSTATIC>::template bound<RangeType::SIZE>(mMax)); pos++){