From eee1805f88eac7daa40aebe582f53f267b0393c3 Mon Sep 17 00:00:00 2001 From: Christian Zimmermann Date: Mon, 20 Nov 2023 00:41:37 +0100 Subject: [PATCH] get intrinsics vector size from environment + compile with avx -> works --- CMakeLists.txt | 16 +++ TODO | 1 - cmake/check_avx.cmake | 24 +++++ src/include/base/base.h | 1 + src/include/base/intrin.h | 25 +++++ src/include/memory/allocator.h | 3 +- src/include/operation/extensions/avx.cc.h | 121 ++++++++++++---------- src/include/operation/extensions/avx.h | 105 ++++++++++--------- 8 files changed, 189 insertions(+), 107 deletions(-) create mode 100644 cmake/check_avx.cmake create mode 100644 src/include/base/intrin.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c71f97a..d56df34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,11 +2,14 @@ cmake_minimum_required(VERSION 3.0) project(cnorxz) +include(cmake/check_avx.cmake) + execute_process(COMMAND bash "-c" "git rev-parse HEAD" OUTPUT_VARIABLE GIT_HASH OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process(COMMAND bash "-c" "git tag -l --sort=refname 'v*' | tail -n1" OUTPUT_VARIABLE GIT_TAG OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process(COMMAND bash "-c" "git rev-list -n 1 ${GIT_TAG}" OUTPUT_VARIABLE GIT_TAG_HASH OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "${GIT_HASH}") option(RUN_PIPELINE "" OFF) +option(SCALAR_BUILD "" OFF) if(RUN_PIPELINE) set(VERSION "v0.0.0-test") else() @@ -40,6 +43,19 @@ else() endif() message(STATUS "found absolute install path '${INSTALL_PATH}'") +if(NOT ${SCALAR_BUILD}) + message(STATUS "check for intrinsics") + check_avx() + if(AVX_AVAIL) + message(STATUS "AVX available") + add_definitions("-DCXZ_HAVE_AVX") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + else() + message(STATUS "AVX not available") + endif() +endif() + +message(STATUS "check for libraries") find_package( GTest REQUIRED ) if(GTest_FOUND) include_directories(${GTEST_INCLUDE_DIRS}) diff --git a/TODO b/TODO index 7ab0e2e..0d70161 100644 --- a/TODO +++ b/TODO @@ -6,7 +6,6 @@ include/array/array_base.cc.h@120: "TODO: check if container format is trivial" include/array/array_base.cc.h@164: "check further compatibility of index/range format" [check] include/array/array_base.cc.h@319: "check further compatibility of index/range format" [check] include/base/dtype.cc.h@23: "for tuple use vector" [comment] -include/memory/allocator.h@27: "get from environment" [urgent] include/ranges/index_base.cc.h@110: "if this assert never applies, remove mPtrId (-> Defaults)" [long] include/ranges/mrange.cc.h@633: "TODO: ZRange (meta and index pos static!)" [long] include/ranges/urange.cc.h@366: "else general transform using DType (better than nothing), to be implemented" [urgent] diff --git a/cmake/check_avx.cmake b/cmake/check_avx.cmake new file mode 100644 index 0000000..2543136 --- /dev/null +++ b/cmake/check_avx.cmake @@ -0,0 +1,24 @@ + +macro(check_avx) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_FLAGS "-Wall -Werror -Wpedantic -std=c++17 -mavx") + check_cxx_source_compiles(" +#include +#include +int main() +{ +const double a[4] = { 0,0,0,0 }; +const double b[4] = { 0,0,0,0 }; +double o[4] = { 0,0,0,0 }; +__m256d av = _mm256_load_pd(a); +__m256d bv = _mm256_load_pd(b); +__m256d ov = _mm256_add_pd(av, bv); +_mm256_store_pd(o, ov); +std::cout << o[0] << std::endl; +return 0; +} +" + AVX_AVAIL + ) +endmacro() + diff --git a/src/include/base/base.h b/src/include/base/base.h index 8e2c0b9..02b1742 100644 --- a/src/include/base/base.h +++ b/src/include/base/base.h @@ -29,6 +29,7 @@ #include "uuid.h" #include "utils.h" #include "config.h" +#include "intrin.h" #include "base.cc.h" diff --git a/src/include/base/intrin.h b/src/include/base/intrin.h new file mode 100644 index 0000000..e9054ec --- /dev/null +++ b/src/include/base/intrin.h @@ -0,0 +1,25 @@ +// -*- C++ -*- +/** + @file include/base/intrin.h + @brief Definitions related to intrinsics + **/ + +#ifndef __cxz_intrin_h__ +#define __cxz_intrin_h__ + +#define MAX_VSIZE 1 + +// AVX: + +#define AVX_BITS 256 +#define AVX_VSIZE (AVX_BITS/8) +#ifdef CXZ_HAVE_AVX +#undef MAX_VSIZE +#define MAX_VSIZE AVX_VSIZE +#endif + +// ...: + +// ... + +#endif diff --git a/src/include/memory/allocator.h b/src/include/memory/allocator.h index 6c7cf6d..552f2dc 100644 --- a/src/include/memory/allocator.h +++ b/src/include/memory/allocator.h @@ -9,6 +9,7 @@ #include #include #include "base/types.h" +#include "base/intrin.h" #define MIB_SIZE 1024*1024 // 1MiB #define WARN_SIZE MIB_SIZE*100 // 100 MiB @@ -24,7 +25,7 @@ namespace CNORXZ typedef T value_type; static constexpr SizeT type_size = sizeof(T); - static constexpr SizeT N = 32; // get from environment!!! + static constexpr SizeT N = MAX_VSIZE; struct VX { diff --git a/src/include/operation/extensions/avx.cc.h b/src/include/operation/extensions/avx.cc.h index 0cde129..4660b52 100644 --- a/src/include/operation/extensions/avx.cc.h +++ b/src/include/operation/extensions/avx.cc.h @@ -10,10 +10,11 @@ namespace CNORXZ * PlusCC / PlusCX * ***********************/ - constexpr decltype(auto) PlusCC::eval(const Consecutive& a, - const Consecutive& b) + inline decltype(auto) + PlusCC::eval(const Consecutive& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_add_pd(av, bv); @@ -21,8 +22,9 @@ namespace CNORXZ return o; } - constexpr decltype(auto) PlusCC::aeval(Consecutive& a, - const Consecutive& b) + inline decltype(auto) + PlusCC::aeval(Consecutive& a, + const Consecutive& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); @@ -32,10 +34,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - PlusCX::eval(const Consecutive& a, const X& b) + inline decltype(auto) + PlusCX::eval(const Consecutive& a, + const X& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); __m256d ov = _mm256_add_pd(av, bv); @@ -44,8 +47,8 @@ namespace CNORXZ } template - static constexpr decltype(auto) - PlusCX::aeval(Consecutive& a, const X& b) + inline decltype(auto) + PlusCX::aeval(Consecutive& a, const X& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); @@ -55,10 +58,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - PlusCX::eval(const X& a, const Consecutive& b) + inline decltype(auto) + PlusCX::eval(const X& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_set1_pd( static_cast(a) ); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_add_pd(av, bv); @@ -70,10 +74,11 @@ namespace CNORXZ * MinusCC / MinusCX * *************************/ - constexpr decltype(auto) MinusCC::eval(const Consecutive& a, - const Consecutive& b) + inline decltype(auto) + MinusCC::eval(const Consecutive& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_sub_pd(av, bv); @@ -81,8 +86,9 @@ namespace CNORXZ return o; } - constexpr decltype(auto) MinusCC::aeval(Consecutive& a, - const Consecutive& b) + inline decltype(auto) + MinusCC::aeval(Consecutive& a, + const Consecutive& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); @@ -92,10 +98,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - MinusCX::eval(const Consecutive& a, const X& b) + inline decltype(auto) + MinusCX::eval(const Consecutive& a, + const X& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); __m256d ov = _mm256_sub_pd(av, bv); @@ -104,8 +111,8 @@ namespace CNORXZ } template - static constexpr decltype(auto) - MinusCX::aeval(Consecutive& a, const X& b) + inline decltype(auto) + MinusCX::aeval(Consecutive& a, const X& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); @@ -115,10 +122,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - MinusCX::eval(const X& a, const Consecutive& b) + inline decltype(auto) + MinusCX::eval(const X& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_set1_pd( static_cast(a) ); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_sub_pd(av, bv); @@ -130,10 +138,11 @@ namespace CNORXZ * MultipliesCC / MultipliesCX * ***********************************/ - constexpr decltype(auto) MultipliesCC::eval(const Consecutive& a, - const Consecutive& b) + inline decltype(auto) + MultipliesCC::eval(const Consecutive& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_mul_pd(av, bv); @@ -141,8 +150,9 @@ namespace CNORXZ return o; } - constexpr decltype(auto) MultipliesCC::aeval(Consecutive& a, - const Consecutive& b) + inline decltype(auto) + MultipliesCC::aeval(Consecutive& a, + const Consecutive& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); @@ -152,10 +162,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - MultipliesCX::eval(const Consecutive& a, const X& b) + inline decltype(auto) + MultipliesCX::eval(const Consecutive& a, + const X& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); __m256d ov = _mm256_mul_pd(av, bv); @@ -164,8 +175,9 @@ namespace CNORXZ } template - static constexpr decltype(auto) - MultipliesCX::aeval(Consecutive& a, const X& b) + inline decltype(auto) + MultipliesCX::aeval(Consecutive& a, + const X& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); @@ -175,10 +187,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - MultipliesCX::eval(const X& a, const Consecutive& b) + inline decltype(auto) + MultipliesCX::eval(const X& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_set1_pd( static_cast(a) ); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_mul_pd(av, bv); @@ -191,10 +204,11 @@ namespace CNORXZ * DividesCC / DividesCX * *****************************/ - constexpr decltype(auto) DividesCC::eval(const Consecutive& a, - const Consecutive& b) + inline decltype(auto) + DividesCC::eval(const Consecutive& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_div_pd(av, bv); @@ -202,8 +216,9 @@ namespace CNORXZ return o; } - constexpr decltype(auto) DividesCC::aeval(Consecutive& a, - const Consecutive& b) + inline decltype(auto) + DividesCC::aeval(Consecutive& a, + const Consecutive& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_load_pd(b.mD); @@ -213,10 +228,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - DividesCX::eval(const Consecutive& a, const X& b) + inline decltype(auto) + DividesCX::eval(const Consecutive& a, + const X& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); __m256d ov = _mm256_div_pd(av, bv); @@ -225,8 +241,8 @@ namespace CNORXZ } template - static constexpr decltype(auto) - DividesCX::aeval(Consecutive& a, const X& b) + inline decltype(auto) + DividesCX::aeval(Consecutive& a, const X& b) { __m256d av = _mm256_load_pd(a.mD); __m256d bv = _mm256_set1_pd( static_cast(b) ); @@ -236,10 +252,11 @@ namespace CNORXZ } template - static constexpr decltype(auto) - DividesCX::eval(const X& a, const Consecutive& b) + inline decltype(auto) + DividesCX::eval(const X& a, + const Consecutive& b) { - Consecutive o; + Consecutive o; __m256d av = _mm256_set1_pd( static_cast(a) ); __m256d bv = _mm256_load_pd(b.mD); __m256d ov = _mm256_div_pd(av, bv); diff --git a/src/include/operation/extensions/avx.h b/src/include/operation/extensions/avx.h index e68e995..20e3f52 100644 --- a/src/include/operation/extensions/avx.h +++ b/src/include/operation/extensions/avx.h @@ -6,105 +6,104 @@ #include "base/base.h" -#define AVX_VSIZE (256/8) - namespace CNORXZ { namespace AVX { static constexpr SizeT ND = AVX_VSIZE/sizeof(Double); + static constexpr SizeT NF = AVX_VSIZE/sizeof(float); } - - template <> - struct PlusCC - { - static constexpr decltype(auto) - eval(const Consecutive& a, const Consecutive& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const Consecutive& b); + template <> + struct PlusCC + { + static inline decltype(auto) + eval(const Consecutive& a, const Consecutive& b); + + static inline decltype(auto) + aeval(Consecutive& a, const Consecutive& b); }; template - struct PlusCX + struct PlusCX { - static constexpr decltype(auto) - eval(const Consecutive& a, const X& b); + static inline decltype(auto) + eval(const Consecutive& a, const X& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const X& b); + static inline decltype(auto) + aeval(Consecutive& a, const X& b); - static constexpr decltype(auto) - eval(const X& a, const Consecutive& b); + static inline decltype(auto) + eval(const X& a, const Consecutive& b); }; template <> - struct MinusCC + struct MinusCC { - static constexpr decltype(auto) - eval(const Consecutive& a, const Consecutive& b); + static inline decltype(auto) + eval(const Consecutive& a, const Consecutive& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const Consecutive& b); + static inline decltype(auto) + aeval(Consecutive& a, const Consecutive& b); }; template - struct MinusCX + struct MinusCX { - static constexpr decltype(auto) - eval(const Consecutive& a, const X& b); + static inline decltype(auto) + eval(const Consecutive& a, const X& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const X& b); + static inline decltype(auto) + aeval(Consecutive& a, const X& b); - static constexpr decltype(auto) - eval(const X& a, const Consecutive& b); + static inline decltype(auto) + eval(const X& a, const Consecutive& b); }; template <> - struct MultipliesCC + struct MultipliesCC { - static constexpr decltype(auto) - eval(const Consecutive& a, const Consecutive& b); + static inline decltype(auto) + eval(const Consecutive& a, const Consecutive& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const Consecutive& b); + static inline decltype(auto) + aeval(Consecutive& a, const Consecutive& b); }; template - struct MultipliesCX + struct MultipliesCX { - static constexpr decltype(auto) - eval(const Consecutive& a, const X& b); + static inline decltype(auto) + eval(const Consecutive& a, const X& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const X& b); + static inline decltype(auto) + aeval(Consecutive& a, const X& b); - static constexpr decltype(auto) - eval(const X& a, const Consecutive& b); + static inline decltype(auto) + eval(const X& a, const Consecutive& b); }; template <> - struct DividesCC + struct DividesCC { - static constexpr decltype(auto) - eval(const Consecutive& a, const Consecutive& b); + static inline decltype(auto) + eval(const Consecutive& a, const Consecutive& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const Consecutive& b); + static inline decltype(auto) + aeval(Consecutive& a, const Consecutive& b); }; template - struct DividesCX + struct DividesCX { - static constexpr decltype(auto) - eval(const Consecutive& a, const X& b); + static inline decltype(auto) + eval(const Consecutive& a, const X& b); - static constexpr decltype(auto) - aeval(Consecutive& a, const X& b); + static inline decltype(auto) + aeval(Consecutive& a, const X& b); - static constexpr decltype(auto) - eval(const X& a, const Consecutive& b); + static inline decltype(auto) + eval(const X& a, const Consecutive& b); }; }