Modern Arm Assembly Language Programming: Covers Armv8-A 32-bit, 64-bit, and SIMD

by Daniel Kusswurm
2021.08.25: updated by
Up

Chapter 15: Armv8-64 SIMD Floating-Point Programming

Packed Floating-Point Arithmetic

NEON Co-processor の 128 bit レジスタを double (64 bit) 2個、float (32 bit) 4個、long (64 bit) 2個、int (32 bit) 4個、short (16 bit) 8個、byte (8 bit) 16個 として扱うための共用体 Vec128 を定義する。

Vec128.h
#pragma once
#include <string>
#include <cstdint>
#include <sstream>
#include <iomanip>

struct alignas(16) Vec128 {
 public:
  union {
    int8_t m_I8[16];
    int16_t m_I16[8];
    int32_t m_I32[4];
    int64_t m_I64[2];
    uint8_t m_U8[16];
    uint16_t m_U16[8];
    uint32_t m_U32[4];
    uint64_t m_U64[2];
    float m_F32[4];
    double m_F64[2];
  };
 private:
  template <typename T> std::string ToStringInt(const T* x, int n, int w) {
    std::ostringstream oss;
    for (int i=0; i<n; i++) {
      oss << std::setw(w) << (int64_t)x[i];
      if (i + 1 == n / 2) oss << "  |";
    }
    return oss.str();
  }
  template <typename T> std::string ToStringUint(const T* x, int n, int w) {
    std::ostringstream oss;
    for (int i=0; i<n; i++) {
      oss << std::setw(w) << (uint64_t)x[i];
      if (i + 1 == n / 2) oss << "  |";
    }
    return oss.str();
  }
  template <typename T> std::string ToStringHex(const T* x, int n, int w) {
    std::ostringstream oss;
    for (int i=0; i<n; i++) {
      const int w_temp = 16;
      std::ostringstream oss_temp;
      oss_temp << std::uppercase << std::hex << std::setfill('0');
      oss_temp << std::setw(w_temp) << (uint64_t)x[i];
      std::string s1 = oss_temp.str();
      std::string s2 = s1.substr(w_temp - sizeof(T) * 2);
      oss << std::setw(w) << s2;
      if (i + 1 == n / 2) oss << "  |";
    }
    return oss.str();
  }
  template <typename T> std::string ToStringFP(const T* x, int n, int w, int p) {
    std::ostringstream oss;
    oss << std::fixed << std::setprecision(p);
    for (int i=0; i<n; i++) {
      oss << std::setw(w) << x[i];
      if (i + 1 == n / 2) oss << "  |";
    }
    return oss.str();
  }
 public:
  std::string ToStringI8(void) {
    return ToStringInt(m_I8, sizeof(m_I8) / sizeof(int8_t), 4);
  }
  std::string ToStringI16(void) {
    return ToStringInt(m_I16, sizeof(m_I16) / sizeof(int16_t), 8);
  }
  std::string ToStringI32(void) {
    return ToStringInt(m_I32, sizeof(m_I32) / sizeof(int32_t), 16);
  }
  std::string ToStringI64(void) {
    return ToStringInt(m_I64, sizeof(m_I64) / sizeof(int64_t), 32);
  }
  std::string ToStringU8(void) {
    return ToStringUint(m_U8, sizeof(m_U8) / sizeof(uint8_t), 4);
  }
  std::string ToStringU16(void) {
    return ToStringUint(m_U16, sizeof(m_U16) / sizeof(uint16_t), 8);
  }
  std::string ToStringU32(void) {
    return ToStringUint(m_I32, sizeof(m_I32) / sizeof(int32_t), 16);
  }
  std::string ToStringU64(void) {
    return ToStringUint(m_I64, sizeof(m_I64) / sizeof(int64_t), 32);
  }
  std::string ToStringX8(void) {
    return ToStringHex(m_U8, sizeof(m_U8) / sizeof(uint8_t), 4);
  }
  std::string ToStringX16(void) {
    return ToStringHex(m_U16, sizeof(m_U16) / sizeof(uint16_t), 8);
  }
  std::string ToStringX32(void) {
    return ToStringHex(m_I32, sizeof(m_I32) / sizeof(int32_t), 16);
  }
  std::string ToStringX64(void) {
    return ToStringHex(m_I64, sizeof(m_I64) / sizeof(int64_t), 32);
  }
  std::string ToStringF32(void) {
    return ToStringFP(m_F32, sizeof(m_F32) / sizeof(float), 16, 6);
  }
  std::string ToStringF64(void) {
    return ToStringFP(m_F64, sizeof(m_F64) / sizeof(double), 32, 12);
  }

};

Basic Arithmetic

NEON を使って基本演算を行うプログラムの説明


Comparisons

NEON を使って比較演算を行うプログラムの説明


Conversions

NEON を使って型変換を行うプログラムの説明



Packed Floating-Point Arrays


Correlation Coefficient

NEON を使って相関係数の計算を行うプログラムの説明


Image Conversion RGB to Grayscale



Packed Floating-Point Matrices

AlignedMem.h
#pragma once
#include <cstdint>
#include <stdexcept>
#include <stdlib.h>

class AlignedMem {
public:
  static void* Allocate(size_t mem_size, size_t mem_alignment) {
    void *ptr;
    if (posix_memalign(&ptr, mem_alignment, mem_size) != 0)
      throw std::runtime_error("Memory allocation error: AllocateAlignedMem()");

    return ptr;
  }

  static void Release(void* p) {
    free(p);
  }

  template <typename T> static bool IsAligned(const T* p, size_t alignment) {
    if (p == nullptr)
      return false;

    if (((uintptr_t)p % alignment) != 0)
      return false;

    return true;
  }
};

template <class T> class AlignedArray {
  T* m_Data;
  size_t m_Size;

public:

  AlignedArray(void) = delete;
  AlignedArray(const AlignedArray& aa) = delete;
  AlignedArray(AlignedArray&& aa) = delete;
  AlignedArray& operator = (const AlignedArray& aa) = delete;
  AlignedArray& operator = (AlignedArray&& aa) = delete;

  AlignedArray(size_t size, size_t alignment) {
    m_Size = size;
    m_Data = (T*)AlignedMem::Allocate(size * sizeof(T), alignment);
  }

  ~AlignedArray() {
    AlignedMem::Release(m_Data);
  }

  T* Data(void)               { return m_Data; }
  size_t Size(void)           { return m_Size; }

  void Fill(T val) {
    for (size_t i = 0; i < m_Size; i++)
      m_Data[i] = val;
  }
};
MatrixF32.h
//-------------------------------------------------
//               MatrixF32.h
//-------------------------------------------------

#pragma once
#include <iostream>
#include <iomanip>
#include <random>
#include <string>
#include <cstring>
#include <stdexcept>
#include "AlignedMem.h"

class MatrixF32 {
  static const size_t c_Alignment = 16;

  size_t m_NumRows;
  size_t m_NumCols;
  size_t m_NumElements;
  size_t m_DataSize;
  float* m_Data;

  int m_OstreamW;
  std::string m_OstreamDelim;

  bool IsConforming(size_t num_rows, size_t num_cols) const {
    return m_NumRows == num_rows && m_NumCols == num_cols;
  }

  void Allocate(size_t num_rows, size_t num_cols) {
    m_NumRows = num_rows;
    m_NumCols = num_cols;
    m_NumElements = m_NumRows * m_NumCols;
    m_DataSize = m_NumElements * sizeof(float);

    if (m_NumElements == 0)
      m_Data = nullptr;
    else
      m_Data = (float*)AlignedMem::Allocate(m_DataSize, c_Alignment);

    SetOstream(10, "  ");
  }

  void Cleanup(void) {
    m_NumRows = m_NumCols = m_NumElements = m_DataSize = 0;
    m_Data = nullptr;
  }

  void Release(void) {
    if (m_Data != nullptr)
      AlignedMem::Release(m_Data);

    Cleanup();
  }

public:
  MatrixF32(void) {
    Allocate(0, 0);
  }

  MatrixF32(size_t num_rows, size_t num_cols) {
    Allocate(num_rows, num_cols);
    Fill(0);
  }

  MatrixF32(size_t num_rows, size_t num_cols, bool set_identity) {
    Allocate(num_rows, num_cols);
    Fill(0);

    if (set_identity && m_NumRows == m_NumCols) {
      for (size_t i = 0; i < num_rows; i++)
	m_Data[i * m_NumCols + i] = 1.0f;
    }
  }

  MatrixF32(const MatrixF32& mat) {
    Allocate(mat.m_NumRows, mat.m_NumCols);
    memcpy(m_Data, mat.m_Data, m_DataSize);
    SetOstream(mat.m_OstreamW, mat.m_OstreamDelim);
  }

  MatrixF32(MatrixF32&& mat) {
    m_NumRows = mat.m_NumRows;
    m_NumCols = mat.m_NumCols;
    m_NumElements = mat.m_NumElements;
    m_DataSize = mat.m_DataSize;
    m_Data = mat.m_Data;
    m_OstreamW = mat.m_OstreamW;
    m_OstreamDelim = mat.m_OstreamDelim;

    mat.Cleanup();
  }

  ~MatrixF32() {
    Release();
  }

  MatrixF32& operator = (const MatrixF32& mat) {
    if (this != &mat) {
      if (!IsConforming(*this, mat)) {
	Release();
	Allocate(mat.m_NumRows, mat.m_NumCols);
      }

      memcpy(m_Data, mat.m_Data, m_DataSize);
      SetOstream(mat.m_OstreamW, mat.m_OstreamDelim);
    }

    return *this;
  }

  MatrixF32& operator = (MatrixF32&& mat) {
    Release();

    m_NumRows = mat.m_NumRows;
    m_NumCols = mat.m_NumCols;
    m_NumElements = mat.m_NumElements;
    m_DataSize = mat.m_DataSize;
    m_Data = mat.m_Data;
    SetOstream(mat.m_OstreamW, mat.m_OstreamDelim);

    mat.Cleanup();
    return *this;
  }

  friend bool operator == (const MatrixF32& mat1, const MatrixF32& mat2) {
    if (!IsConforming(mat1, mat2))
      return false;

    return (memcmp(mat1.m_Data, mat2.m_Data, mat1.m_DataSize) == 0) ? true : false;
  }

  friend bool operator != (const MatrixF32& mat1, const MatrixF32& mat2) {
    if (!IsConforming(mat1, mat2))
      return false;

    return (memcmp(mat1.m_Data, mat2.m_Data, mat1.m_DataSize) != 0) ? true : false;
  }

  float* Data(void) { return m_Data; }
  const float* Data(void) const { return m_Data; }
  size_t GetNumRows(void) const { return m_NumRows; }
  size_t GetNumCols(void) const { return m_NumCols; }
  size_t GetNumElements(void) const { return m_NumElements; }
  bool IsSquare(void) const { return m_NumRows == m_NumCols; }

  friend MatrixF32 operator + (const MatrixF32& mat1, const MatrixF32& mat2) {
    if (!IsConforming(mat1, mat2))
      throw std::runtime_error("Non-conforming operands: operator +");

    MatrixF32 result(mat1.m_NumRows, mat1.m_NumCols);

    for (size_t i = 0; i < result.m_NumElements; i++)
      result.m_Data[i] = mat1.m_Data[i] + mat2.m_Data[i];

    return result;
  }

  friend MatrixF32 operator * (const MatrixF32& mat1, const MatrixF32& mat2) {
    if (mat1.m_NumCols != mat2.m_NumRows)
      throw std::runtime_error("Non-conforming operands: operator *");

    size_t m = mat1.m_NumCols;
    MatrixF32 result(mat1.m_NumRows, mat2.m_NumCols);

    for (size_t i = 0; i < result.m_NumRows; i++) {
      for (size_t j = 0; j < result.m_NumCols; j++) {
	float sum = 0;

	for (size_t k = 0; k < m; k++) {
	  float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j];
	  sum += val;
	}

	result.m_Data[i * result.m_NumCols + j] = sum;
      }
    }

    return result;
  }

  //
  // For some operations, static functions are used instead of overloaded
  // operators in order to avoid inaccurate benchmark performance comparisons.
  //

  static void Add(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) {
    if (!IsConforming(result, mat1) || !IsConforming(mat1, mat2))
      throw std::runtime_error("Non-conforming operands: MatrixF32::Add");

    for (size_t i = 0; i < result.m_NumElements; i++)
      result.m_Data[i] = mat1.m_Data[i] + mat2.m_Data[i];
  }

  static void Mul(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) {
    if (mat1.m_NumCols != mat2.m_NumRows)
      throw std::runtime_error("Non-conforming operands: MatrixF32::Mul");

    if (result.m_NumRows != mat1.m_NumRows || result.m_NumCols != mat2.m_NumCols)
      throw std::runtime_error("Invalid matrix size: MatrixF32::Mul");

    size_t m = mat1.m_NumCols;

    for (size_t i = 0; i < result.m_NumRows; i++) {
      for (size_t j = 0; j < result.m_NumCols; j++) {
	float sum = 0;

	for (size_t k = 0; k < m; k++) {
	  float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j];
	  sum += val;
	}

	result.m_Data[i * result.m_NumCols + j] = sum;
      }
    }
  }

  static void Mul4x4(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) {
    const size_t nrows = 4;
    const size_t ncols = 4;
    const size_t m = 4;

    for (size_t i = 0; i < nrows; i++) {
      for (size_t j = 0; j < ncols; j++) {
	float sum = 0;

	for (size_t k = 0; k < m; k++) {
	  float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j];
	  sum += val;
	}

	result.m_Data[i * result.m_NumCols + j] = sum;
      }
    }
  }

  static void MulScalar(MatrixF32& result, const MatrixF32& mat, float val) {
    if (!IsConforming(result, mat))
      throw std::runtime_error("Non-conforming operands: MatrixF32::MulScalar");

    for (size_t i = 0; i < result.m_NumElements; i++)
      result.m_Data[i] = mat.m_Data[i] * val;
  }

  static void Transpose(MatrixF32& result, const MatrixF32& mat1) {
    if (result.m_NumRows != mat1.m_NumCols || result.m_NumCols != mat1.m_NumRows)
      throw std::runtime_error("Non-conforming operands: MatrixF32::Transpose");

    for (size_t i = 0; i < result.m_NumRows; i++) {
      for (size_t j = 0; j < result.m_NumCols; j++)
	result.m_Data[i * result.m_NumCols + j] = mat1.m_Data[j * mat1.m_NumCols + i];
    }
  }

  static bool IsConforming(const MatrixF32& mat1, const MatrixF32& mat2) {
    return mat1.m_NumRows == mat2.m_NumRows && mat1.m_NumCols == mat2.m_NumCols;
  }

  void Fill(float val) {
    for (size_t i = 0; i < m_NumElements; i++)
      m_Data[i] = val;
  }

  void FillRandomInt(unsigned int seed_val, int min_val, int max_val, bool exclude_zero) {
    std::uniform_int_distribution<> dist {min_val, max_val};
    std::mt19937 rng {seed_val};

    if (exclude_zero) {
      for (size_t i = 0; i < m_NumElements; i++) {
	int val;

	while ((val = dist(rng)) == 0) {
	}

	m_Data[i] = (float)val;
      }
    } else {
      for (size_t i = 0; i < m_NumElements; i++)
	m_Data[i] = (float)dist(rng);
    }
  }

  void RoundToZero(float epsilon) {
    for (size_t i = 0; i < m_NumElements; i++) {
      if (fabs(m_Data[i]) < epsilon)
	m_Data[i] = 0;
    }
  }

  void RoundToI(float epsilon) {
    for (size_t i = 0; i < m_NumRows; i++) {
      for (size_t j = 0; j < m_NumCols; j++) {
	size_t k = i * m_NumCols + j;

	if (i != j) {
	  if (fabs(m_Data[k]) < epsilon)
	    m_Data[k] = 0;
	} else {
	  if (fabs(m_Data[k] - 1.0f) < epsilon)
	    m_Data[k] = 1.0f;
	}
      }
    }
  }

  void SetCol(size_t col, const float* vals) {
    if (col >= m_NumCols)
      throw std::runtime_error("Invalid column index: MatrixF32::SetCol()");

    for (size_t i = 0; i < m_NumRows; i++)
      m_Data[i * m_NumCols + col] = vals[i];
  }

  void SetRow(size_t row, const float* vals) {
    if (row >= m_NumRows)
      throw std::runtime_error("Invalid row index: MatrixF32::SetRow()");

    for (size_t j = 0; j < m_NumCols; j++)
      m_Data[row * m_NumCols + j] = vals[j];
  }

  void SetI(void) {
    if (!IsSquare())
      throw std::runtime_error("Square matrix required: MatrixF32::SetI()");

    for (size_t i = 0; i < m_NumRows; i++) {
      for (size_t j = 0; j < m_NumCols; j++)
	m_Data[i * m_NumCols + j] = (i == j) ? 1.0f : 0.0f;
    }
  }

  void SetOstream(int w, const std::string& delim){
    m_OstreamW = w;
    m_OstreamDelim = delim;
  }

  float Trace(void) const {
    if (!IsSquare())
      throw std::runtime_error("Square matrix required: MatrixF32::Trace()");

    float sum = 0;

    for (size_t i = 0; i < m_NumRows; i++)
      sum += m_Data[i * m_NumCols + i];

    return sum;
  }

  float Trace4x4(void) const {
    if (m_NumRows != 4 || m_NumCols != 4)
      throw std::runtime_error("4x4 matrix required: MatrixF32::Trace()");

    float sum = m_Data[0] + m_Data[5] + m_Data[10] + m_Data[15];

    return sum;
  }

  friend std::ostream& operator << (std::ostream& os, const MatrixF32& mat) {
    for (size_t i = 0; i < mat.m_NumRows; i++) {
      for (size_t j = 0; j < mat.m_NumCols; j++) {
	os << std::setw(mat.m_OstreamW) << mat.m_Data[i * mat.m_NumCols + j];

	if (j + 1 < mat.m_NumCols)
	  os << mat.m_OstreamDelim;
      }

      os << std::endl;
    }

    return os;
  }
};

Matrix Multiplication

NEON を使って4x4行列の乗算を行うプログラムの説明


Matrix Trace and Transposition

NEON を使って4x4行列の転置行列求めるプログラムの説明


Summary



http://karel.tsuda.ac.jp/