NEON Co-processor の 128 bit レジスタを double (64 bit) 2個、float (32 bit) 4個、long (64 bit) 2個、int (32 bit) 4個、short (16 bit) 8個、byte (8 bit) 16個 として扱うための共用体 Vec128 を定義する。
Vec128.h |
#pragma once #include <string> #include <cstdint> #include <sstream> #include <iomanip> struct alignas(16) Vec128 { public: union { int8_t m_I8[16]; int16_t m_I16[8]; int32_t m_I32[4]; int64_t m_I64[2]; uint8_t m_U8[16]; uint16_t m_U16[8]; uint32_t m_U32[4]; uint64_t m_U64[2]; float m_F32[4]; double m_F64[2]; }; private: template <typename T> std::string ToStringInt(const T* x, int n, int w) { std::ostringstream oss; for (int i=0; i<n; i++) { oss << std::setw(w) << (int64_t)x[i]; if (i + 1 == n / 2) oss << " |"; } return oss.str(); } template <typename T> std::string ToStringUint(const T* x, int n, int w) { std::ostringstream oss; for (int i=0; i<n; i++) { oss << std::setw(w) << (uint64_t)x[i]; if (i + 1 == n / 2) oss << " |"; } return oss.str(); } template <typename T> std::string ToStringHex(const T* x, int n, int w) { std::ostringstream oss; for (int i=0; i<n; i++) { const int w_temp = 16; std::ostringstream oss_temp; oss_temp << std::uppercase << std::hex << std::setfill('0'); oss_temp << std::setw(w_temp) << (uint64_t)x[i]; std::string s1 = oss_temp.str(); std::string s2 = s1.substr(w_temp - sizeof(T) * 2); oss << std::setw(w) << s2; if (i + 1 == n / 2) oss << " |"; } return oss.str(); } template <typename T> std::string ToStringFP(const T* x, int n, int w, int p) { std::ostringstream oss; oss << std::fixed << std::setprecision(p); for (int i=0; i<n; i++) { oss << std::setw(w) << x[i]; if (i + 1 == n / 2) oss << " |"; } return oss.str(); } public: std::string ToStringI8(void) { return ToStringInt(m_I8, sizeof(m_I8) / sizeof(int8_t), 4); } std::string ToStringI16(void) { return ToStringInt(m_I16, sizeof(m_I16) / sizeof(int16_t), 8); } std::string ToStringI32(void) { return ToStringInt(m_I32, sizeof(m_I32) / sizeof(int32_t), 16); } std::string ToStringI64(void) { return ToStringInt(m_I64, sizeof(m_I64) / sizeof(int64_t), 32); } std::string ToStringU8(void) { return ToStringUint(m_U8, sizeof(m_U8) / sizeof(uint8_t), 4); } std::string ToStringU16(void) { return ToStringUint(m_U16, sizeof(m_U16) / sizeof(uint16_t), 8); } std::string ToStringU32(void) { return ToStringUint(m_I32, sizeof(m_I32) / sizeof(int32_t), 16); } std::string ToStringU64(void) { return ToStringUint(m_I64, sizeof(m_I64) / sizeof(int64_t), 32); } std::string ToStringX8(void) { return ToStringHex(m_U8, sizeof(m_U8) / sizeof(uint8_t), 4); } std::string ToStringX16(void) { return ToStringHex(m_U16, sizeof(m_U16) / sizeof(uint16_t), 8); } std::string ToStringX32(void) { return ToStringHex(m_I32, sizeof(m_I32) / sizeof(int32_t), 16); } std::string ToStringX64(void) { return ToStringHex(m_I64, sizeof(m_I64) / sizeof(int64_t), 32); } std::string ToStringF32(void) { return ToStringFP(m_F32, sizeof(m_F32) / sizeof(float), 16, 6); } std::string ToStringF64(void) { return ToStringFP(m_F64, sizeof(m_F64) / sizeof(double), 32, 12); } }; |
NEON を使って基本演算を行うプログラムの説明。
NEON を使って比較演算を行うプログラムの説明。
NEON を使って型変換を行うプログラムの説明。
NEON を使って相関係数の計算を行うプログラムの説明。
略
AlignedMem.h |
#pragma once #include <cstdint> #include <stdexcept> #include <stdlib.h> class AlignedMem { public: static void* Allocate(size_t mem_size, size_t mem_alignment) { void *ptr; if (posix_memalign(&ptr, mem_alignment, mem_size) != 0) throw std::runtime_error("Memory allocation error: AllocateAlignedMem()"); return ptr; } static void Release(void* p) { free(p); } template <typename T> static bool IsAligned(const T* p, size_t alignment) { if (p == nullptr) return false; if (((uintptr_t)p % alignment) != 0) return false; return true; } }; template <class T> class AlignedArray { T* m_Data; size_t m_Size; public: AlignedArray(void) = delete; AlignedArray(const AlignedArray& aa) = delete; AlignedArray(AlignedArray&& aa) = delete; AlignedArray& operator = (const AlignedArray& aa) = delete; AlignedArray& operator = (AlignedArray&& aa) = delete; AlignedArray(size_t size, size_t alignment) { m_Size = size; m_Data = (T*)AlignedMem::Allocate(size * sizeof(T), alignment); } ~AlignedArray() { AlignedMem::Release(m_Data); } T* Data(void) { return m_Data; } size_t Size(void) { return m_Size; } void Fill(T val) { for (size_t i = 0; i < m_Size; i++) m_Data[i] = val; } }; |
MatrixF32.h |
//------------------------------------------------- // MatrixF32.h //------------------------------------------------- #pragma once #include <iostream> #include <iomanip> #include <random> #include <string> #include <cstring> #include <stdexcept> #include "AlignedMem.h" class MatrixF32 { static const size_t c_Alignment = 16; size_t m_NumRows; size_t m_NumCols; size_t m_NumElements; size_t m_DataSize; float* m_Data; int m_OstreamW; std::string m_OstreamDelim; bool IsConforming(size_t num_rows, size_t num_cols) const { return m_NumRows == num_rows && m_NumCols == num_cols; } void Allocate(size_t num_rows, size_t num_cols) { m_NumRows = num_rows; m_NumCols = num_cols; m_NumElements = m_NumRows * m_NumCols; m_DataSize = m_NumElements * sizeof(float); if (m_NumElements == 0) m_Data = nullptr; else m_Data = (float*)AlignedMem::Allocate(m_DataSize, c_Alignment); SetOstream(10, " "); } void Cleanup(void) { m_NumRows = m_NumCols = m_NumElements = m_DataSize = 0; m_Data = nullptr; } void Release(void) { if (m_Data != nullptr) AlignedMem::Release(m_Data); Cleanup(); } public: MatrixF32(void) { Allocate(0, 0); } MatrixF32(size_t num_rows, size_t num_cols) { Allocate(num_rows, num_cols); Fill(0); } MatrixF32(size_t num_rows, size_t num_cols, bool set_identity) { Allocate(num_rows, num_cols); Fill(0); if (set_identity && m_NumRows == m_NumCols) { for (size_t i = 0; i < num_rows; i++) m_Data[i * m_NumCols + i] = 1.0f; } } MatrixF32(const MatrixF32& mat) { Allocate(mat.m_NumRows, mat.m_NumCols); memcpy(m_Data, mat.m_Data, m_DataSize); SetOstream(mat.m_OstreamW, mat.m_OstreamDelim); } MatrixF32(MatrixF32&& mat) { m_NumRows = mat.m_NumRows; m_NumCols = mat.m_NumCols; m_NumElements = mat.m_NumElements; m_DataSize = mat.m_DataSize; m_Data = mat.m_Data; m_OstreamW = mat.m_OstreamW; m_OstreamDelim = mat.m_OstreamDelim; mat.Cleanup(); } ~MatrixF32() { Release(); } MatrixF32& operator = (const MatrixF32& mat) { if (this != &mat) { if (!IsConforming(*this, mat)) { Release(); Allocate(mat.m_NumRows, mat.m_NumCols); } memcpy(m_Data, mat.m_Data, m_DataSize); SetOstream(mat.m_OstreamW, mat.m_OstreamDelim); } return *this; } MatrixF32& operator = (MatrixF32&& mat) { Release(); m_NumRows = mat.m_NumRows; m_NumCols = mat.m_NumCols; m_NumElements = mat.m_NumElements; m_DataSize = mat.m_DataSize; m_Data = mat.m_Data; SetOstream(mat.m_OstreamW, mat.m_OstreamDelim); mat.Cleanup(); return *this; } friend bool operator == (const MatrixF32& mat1, const MatrixF32& mat2) { if (!IsConforming(mat1, mat2)) return false; return (memcmp(mat1.m_Data, mat2.m_Data, mat1.m_DataSize) == 0) ? true : false; } friend bool operator != (const MatrixF32& mat1, const MatrixF32& mat2) { if (!IsConforming(mat1, mat2)) return false; return (memcmp(mat1.m_Data, mat2.m_Data, mat1.m_DataSize) != 0) ? true : false; } float* Data(void) { return m_Data; } const float* Data(void) const { return m_Data; } size_t GetNumRows(void) const { return m_NumRows; } size_t GetNumCols(void) const { return m_NumCols; } size_t GetNumElements(void) const { return m_NumElements; } bool IsSquare(void) const { return m_NumRows == m_NumCols; } friend MatrixF32 operator + (const MatrixF32& mat1, const MatrixF32& mat2) { if (!IsConforming(mat1, mat2)) throw std::runtime_error("Non-conforming operands: operator +"); MatrixF32 result(mat1.m_NumRows, mat1.m_NumCols); for (size_t i = 0; i < result.m_NumElements; i++) result.m_Data[i] = mat1.m_Data[i] + mat2.m_Data[i]; return result; } friend MatrixF32 operator * (const MatrixF32& mat1, const MatrixF32& mat2) { if (mat1.m_NumCols != mat2.m_NumRows) throw std::runtime_error("Non-conforming operands: operator *"); size_t m = mat1.m_NumCols; MatrixF32 result(mat1.m_NumRows, mat2.m_NumCols); for (size_t i = 0; i < result.m_NumRows; i++) { for (size_t j = 0; j < result.m_NumCols; j++) { float sum = 0; for (size_t k = 0; k < m; k++) { float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j]; sum += val; } result.m_Data[i * result.m_NumCols + j] = sum; } } return result; } // // For some operations, static functions are used instead of overloaded // operators in order to avoid inaccurate benchmark performance comparisons. // static void Add(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) { if (!IsConforming(result, mat1) || !IsConforming(mat1, mat2)) throw std::runtime_error("Non-conforming operands: MatrixF32::Add"); for (size_t i = 0; i < result.m_NumElements; i++) result.m_Data[i] = mat1.m_Data[i] + mat2.m_Data[i]; } static void Mul(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) { if (mat1.m_NumCols != mat2.m_NumRows) throw std::runtime_error("Non-conforming operands: MatrixF32::Mul"); if (result.m_NumRows != mat1.m_NumRows || result.m_NumCols != mat2.m_NumCols) throw std::runtime_error("Invalid matrix size: MatrixF32::Mul"); size_t m = mat1.m_NumCols; for (size_t i = 0; i < result.m_NumRows; i++) { for (size_t j = 0; j < result.m_NumCols; j++) { float sum = 0; for (size_t k = 0; k < m; k++) { float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j]; sum += val; } result.m_Data[i * result.m_NumCols + j] = sum; } } } static void Mul4x4(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) { const size_t nrows = 4; const size_t ncols = 4; const size_t m = 4; for (size_t i = 0; i < nrows; i++) { for (size_t j = 0; j < ncols; j++) { float sum = 0; for (size_t k = 0; k < m; k++) { float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j]; sum += val; } result.m_Data[i * result.m_NumCols + j] = sum; } } } static void MulScalar(MatrixF32& result, const MatrixF32& mat, float val) { if (!IsConforming(result, mat)) throw std::runtime_error("Non-conforming operands: MatrixF32::MulScalar"); for (size_t i = 0; i < result.m_NumElements; i++) result.m_Data[i] = mat.m_Data[i] * val; } static void Transpose(MatrixF32& result, const MatrixF32& mat1) { if (result.m_NumRows != mat1.m_NumCols || result.m_NumCols != mat1.m_NumRows) throw std::runtime_error("Non-conforming operands: MatrixF32::Transpose"); for (size_t i = 0; i < result.m_NumRows; i++) { for (size_t j = 0; j < result.m_NumCols; j++) result.m_Data[i * result.m_NumCols + j] = mat1.m_Data[j * mat1.m_NumCols + i]; } } static bool IsConforming(const MatrixF32& mat1, const MatrixF32& mat2) { return mat1.m_NumRows == mat2.m_NumRows && mat1.m_NumCols == mat2.m_NumCols; } void Fill(float val) { for (size_t i = 0; i < m_NumElements; i++) m_Data[i] = val; } void FillRandomInt(unsigned int seed_val, int min_val, int max_val, bool exclude_zero) { std::uniform_int_distribution<> dist {min_val, max_val}; std::mt19937 rng {seed_val}; if (exclude_zero) { for (size_t i = 0; i < m_NumElements; i++) { int val; while ((val = dist(rng)) == 0) { } m_Data[i] = (float)val; } } else { for (size_t i = 0; i < m_NumElements; i++) m_Data[i] = (float)dist(rng); } } void RoundToZero(float epsilon) { for (size_t i = 0; i < m_NumElements; i++) { if (fabs(m_Data[i]) < epsilon) m_Data[i] = 0; } } void RoundToI(float epsilon) { for (size_t i = 0; i < m_NumRows; i++) { for (size_t j = 0; j < m_NumCols; j++) { size_t k = i * m_NumCols + j; if (i != j) { if (fabs(m_Data[k]) < epsilon) m_Data[k] = 0; } else { if (fabs(m_Data[k] - 1.0f) < epsilon) m_Data[k] = 1.0f; } } } } void SetCol(size_t col, const float* vals) { if (col >= m_NumCols) throw std::runtime_error("Invalid column index: MatrixF32::SetCol()"); for (size_t i = 0; i < m_NumRows; i++) m_Data[i * m_NumCols + col] = vals[i]; } void SetRow(size_t row, const float* vals) { if (row >= m_NumRows) throw std::runtime_error("Invalid row index: MatrixF32::SetRow()"); for (size_t j = 0; j < m_NumCols; j++) m_Data[row * m_NumCols + j] = vals[j]; } void SetI(void) { if (!IsSquare()) throw std::runtime_error("Square matrix required: MatrixF32::SetI()"); for (size_t i = 0; i < m_NumRows; i++) { for (size_t j = 0; j < m_NumCols; j++) m_Data[i * m_NumCols + j] = (i == j) ? 1.0f : 0.0f; } } void SetOstream(int w, const std::string& delim){ m_OstreamW = w; m_OstreamDelim = delim; } float Trace(void) const { if (!IsSquare()) throw std::runtime_error("Square matrix required: MatrixF32::Trace()"); float sum = 0; for (size_t i = 0; i < m_NumRows; i++) sum += m_Data[i * m_NumCols + i]; return sum; } float Trace4x4(void) const { if (m_NumRows != 4 || m_NumCols != 4) throw std::runtime_error("4x4 matrix required: MatrixF32::Trace()"); float sum = m_Data[0] + m_Data[5] + m_Data[10] + m_Data[15]; return sum; } friend std::ostream& operator << (std::ostream& os, const MatrixF32& mat) { for (size_t i = 0; i < mat.m_NumRows; i++) { for (size_t j = 0; j < mat.m_NumCols; j++) { os << std::setw(mat.m_OstreamW) << mat.m_Data[i * mat.m_NumCols + j]; if (j + 1 < mat.m_NumCols) os << mat.m_OstreamDelim; } os << std::endl; } return os; } }; |
NEON を使って4x4行列の乗算を行うプログラムの説明。
NEON を使って4x4行列の転置行列求めるプログラムの説明。
略