NDArray with strides + NDArrayObject + Models + Exceptions in IRRT. #506

Closed
lyken wants to merge 51 commits from ndstrides-intro into ndstrides
Showing only changes of commit 331ab8a946 - Show all commits

View File

@ -10,25 +10,28 @@ using NDIndex = uint32_t;
// The type of an index or a value describing the length of a range/slice is always `int32_t`. // The type of an index or a value describing the length of a range/slice is always `int32_t`.
using SliceIndex = int32_t; using SliceIndex = int32_t;
namespace { namespace
template <typename T> {
const T& max(const T& a, const T& b) { template <typename T> const T &max(const T &a, const T &b)
{
return a > b ? a : b; return a > b ? a : b;
} }
template <typename T> template <typename T> const T &min(const T &a, const T &b)
const T& min(const T& a, const T& b) { {
return a > b ? b : a; return a > b ? b : a;
} }
// adapted from GNU Scientific Library: https://git.savannah.gnu.org/cgit/gsl.git/tree/sys/pow_int.c // adapted from GNU Scientific Library: https://git.savannah.gnu.org/cgit/gsl.git/tree/sys/pow_int.c
// need to make sure `exp >= 0` before calling this function // need to make sure `exp >= 0` before calling this function
template <typename T> template <typename T> T __nac3_int_exp_impl(T base, T exp)
T __nac3_int_exp_impl(T base, T exp) { {
T res = 1; T res = 1;
/* repeated squaring method */ /* repeated squaring method */
do { do
if (exp & 1) { {
if (exp & 1)
{
res *= base; /* for n odd */ res *= base; /* for n odd */
} }
exp >>= 1; exp >>= 1;
@ -38,16 +41,13 @@ T __nac3_int_exp_impl(T base, T exp) {
} }
template <typename SizeT> template <typename SizeT>
SizeT __nac3_ndarray_calc_size_impl( SizeT __nac3_ndarray_calc_size_impl(const SizeT *list_data, SizeT list_len, SizeT begin_idx, SizeT end_idx)
const SizeT* list_data, {
SizeT list_len,
SizeT begin_idx,
SizeT end_idx
) {
__builtin_assume(end_idx <= list_len); __builtin_assume(end_idx <= list_len);
SizeT num_elems = 1; SizeT num_elems = 1;
for (SizeT i = begin_idx; i < end_idx; ++i) { for (SizeT i = begin_idx; i < end_idx; ++i)
{
SizeT val = list_data[i]; SizeT val = list_data[i];
__builtin_assume(val > 0); __builtin_assume(val > 0);
num_elems *= val; num_elems *= val;
@ -56,14 +56,11 @@ SizeT __nac3_ndarray_calc_size_impl(
} }
template <typename SizeT> template <typename SizeT>
void __nac3_ndarray_calc_nd_indices_impl( void __nac3_ndarray_calc_nd_indices_impl(SizeT index, const SizeT *dims, SizeT num_dims, NDIndex *idxs)
SizeT index, {
const SizeT* dims,
SizeT num_dims,
NDIndex* idxs
) {
SizeT stride = 1; SizeT stride = 1;
for (SizeT dim = 0; dim < num_dims; dim++) { for (SizeT dim = 0; dim < num_dims; dim++)
{
SizeT i = num_dims - dim - 1; SizeT i = num_dims - dim - 1;
__builtin_assume(dims[i] > 0); __builtin_assume(dims[i] > 0);
idxs[i] = (index / stride) % dims[i]; idxs[i] = (index / stride) % dims[i];
@ -72,17 +69,15 @@ void __nac3_ndarray_calc_nd_indices_impl(
} }
template <typename SizeT> template <typename SizeT>
SizeT __nac3_ndarray_flatten_index_impl( SizeT __nac3_ndarray_flatten_index_impl(const SizeT *dims, SizeT num_dims, const NDIndex *indices, SizeT num_indices)
const SizeT* dims, {
SizeT num_dims,
const NDIndex* indices,
SizeT num_indices
) {
SizeT idx = 0; SizeT idx = 0;
SizeT stride = 1; SizeT stride = 1;
for (SizeT i = 0; i < num_dims; ++i) { for (SizeT i = 0; i < num_dims; ++i)
{
SizeT ri = num_dims - i - 1; SizeT ri = num_dims - i - 1;
if (ri < num_indices) { if (ri < num_indices)
{
idx += stride * indices[ri]; idx += stride * indices[ri];
} }
@ -93,322 +88,297 @@ SizeT __nac3_ndarray_flatten_index_impl(
} }
template <typename SizeT> template <typename SizeT>
void __nac3_ndarray_calc_broadcast_impl( void __nac3_ndarray_calc_broadcast_impl(const SizeT *lhs_dims, SizeT lhs_ndims, const SizeT *rhs_dims, SizeT rhs_ndims,
const SizeT* lhs_dims, SizeT *out_dims)
SizeT lhs_ndims, {
const SizeT* rhs_dims,
SizeT rhs_ndims,
SizeT* out_dims
) {
SizeT max_ndims = lhs_ndims > rhs_ndims ? lhs_ndims : rhs_ndims; SizeT max_ndims = lhs_ndims > rhs_ndims ? lhs_ndims : rhs_ndims;
for (SizeT i = 0; i < max_ndims; ++i) { for (SizeT i = 0; i < max_ndims; ++i)
const SizeT* lhs_dim_sz = i < lhs_ndims ? &lhs_dims[lhs_ndims - i - 1] : nullptr; {
const SizeT* rhs_dim_sz = i < rhs_ndims ? &rhs_dims[rhs_ndims - i - 1] : nullptr; const SizeT *lhs_dim_sz = i < lhs_ndims ? &lhs_dims[lhs_ndims - i - 1] : nullptr;
SizeT* out_dim = &out_dims[max_ndims - i - 1]; const SizeT *rhs_dim_sz = i < rhs_ndims ? &rhs_dims[rhs_ndims - i - 1] : nullptr;
SizeT *out_dim = &out_dims[max_ndims - i - 1];
if (lhs_dim_sz == nullptr) { if (lhs_dim_sz == nullptr)
{
*out_dim = *rhs_dim_sz; *out_dim = *rhs_dim_sz;
} else if (rhs_dim_sz == nullptr) { }
else if (rhs_dim_sz == nullptr)
{
*out_dim = *lhs_dim_sz; *out_dim = *lhs_dim_sz;
} else if (*lhs_dim_sz == 1) { }
else if (*lhs_dim_sz == 1)
{
*out_dim = *rhs_dim_sz; *out_dim = *rhs_dim_sz;
} else if (*rhs_dim_sz == 1) { }
else if (*rhs_dim_sz == 1)
{
*out_dim = *lhs_dim_sz; *out_dim = *lhs_dim_sz;
} else if (*lhs_dim_sz == *rhs_dim_sz) { }
else if (*lhs_dim_sz == *rhs_dim_sz)
{
*out_dim = *lhs_dim_sz; *out_dim = *lhs_dim_sz;
} else { }
else
{
__builtin_unreachable(); __builtin_unreachable();
} }
} }
} }
template <typename SizeT> template <typename SizeT>
void __nac3_ndarray_calc_broadcast_idx_impl( void __nac3_ndarray_calc_broadcast_idx_impl(const SizeT *src_dims, SizeT src_ndims, const NDIndex *in_idx,
const SizeT* src_dims, NDIndex *out_idx)
SizeT src_ndims, {
const NDIndex* in_idx, for (SizeT i = 0; i < src_ndims; ++i)
NDIndex* out_idx {
) {
for (SizeT i = 0; i < src_ndims; ++i) {
SizeT src_i = src_ndims - i - 1; SizeT src_i = src_ndims - i - 1;
out_idx[src_i] = src_dims[src_i] == 1 ? 0 : in_idx[src_i]; out_idx[src_i] = src_dims[src_i] == 1 ? 0 : in_idx[src_i];
} }
} }
} // namespace } // namespace
extern "C" { extern "C"
#define DEF_nac3_int_exp_(T) \ {
T __nac3_int_exp_##T(T base, T exp) {\ #define DEF_nac3_int_exp_(T) \
return __nac3_int_exp_impl(base, exp);\ T __nac3_int_exp_##T(T base, T exp) \
{ \
return __nac3_int_exp_impl(base, exp); \
} }
DEF_nac3_int_exp_(int32_t) DEF_nac3_int_exp_(int32_t) DEF_nac3_int_exp_(int64_t) DEF_nac3_int_exp_(uint32_t) DEF_nac3_int_exp_(uint64_t)
DEF_nac3_int_exp_(int64_t)
DEF_nac3_int_exp_(uint32_t)
DEF_nac3_int_exp_(uint64_t)
SliceIndex __nac3_slice_index_bound(SliceIndex i, const SliceIndex len) { SliceIndex __nac3_slice_index_bound(SliceIndex i, const SliceIndex len)
if (i < 0) { {
i = len + i; if (i < 0)
} {
if (i < 0) { i = len + i;
return 0;
} else if (i > len) {
return len;
}
return i;
}
SliceIndex __nac3_range_slice_len(
const SliceIndex start,
const SliceIndex end,
const SliceIndex step
) {
SliceIndex diff = end - start;
if (diff > 0 && step > 0) {
return ((diff - 1) / step) + 1;
} else if (diff < 0 && step < 0) {
return ((diff + 1) / step) + 1;
} else {
return 0;
}
}
// Handle list assignment and dropping part of the list when
// both dest_step and src_step are +1.
// - All the index must *not* be out-of-bound or negative,
// - The end index is *inclusive*,
// - The length of src and dest slice size should already
// be checked: if dest.step == 1 then len(src) <= len(dest) else len(src) == len(dest)
SliceIndex __nac3_list_slice_assign_var_size(
SliceIndex dest_start,
SliceIndex dest_end,
SliceIndex dest_step,
uint8_t* dest_arr,
SliceIndex dest_arr_len,
SliceIndex src_start,
SliceIndex src_end,
SliceIndex src_step,
uint8_t* src_arr,
SliceIndex src_arr_len,
const SliceIndex size
) {
/* if dest_arr_len == 0, do nothing since we do not support extending list */
if (dest_arr_len == 0) return dest_arr_len;
/* if both step is 1, memmove directly, handle the dropping of the list, and shrink size */
if (src_step == dest_step && dest_step == 1) {
const SliceIndex src_len = (src_end >= src_start) ? (src_end - src_start + 1) : 0;
const SliceIndex dest_len = (dest_end >= dest_start) ? (dest_end - dest_start + 1) : 0;
if (src_len > 0) {
__builtin_memmove(
dest_arr + dest_start * size,
src_arr + src_start * size,
src_len * size
);
} }
if (dest_len > 0) { if (i < 0)
/* dropping */ {
__builtin_memmove( return 0;
dest_arr + (dest_start + src_len) * size,
dest_arr + (dest_end + 1) * size,
(dest_arr_len - dest_end - 1) * size
);
} }
/* shrink size */ else if (i > len)
return dest_arr_len - (dest_len - src_len); {
return len;
}
return i;
} }
/* if two range overlaps, need alloca */
uint8_t need_alloca = SliceIndex __nac3_range_slice_len(const SliceIndex start, const SliceIndex end, const SliceIndex step)
(dest_arr == src_arr) {
&& !( SliceIndex diff = end - start;
max(dest_start, dest_end) < min(src_start, src_end) if (diff > 0 && step > 0)
|| max(src_start, src_end) < min(dest_start, dest_end) {
); return ((diff - 1) / step) + 1;
if (need_alloca) { }
uint8_t* tmp = reinterpret_cast<uint8_t *>(__builtin_alloca(src_arr_len * size)); else if (diff < 0 && step < 0)
__builtin_memcpy(tmp, src_arr, src_arr_len * size); {
src_arr = tmp; return ((diff + 1) / step) + 1;
} }
SliceIndex src_ind = src_start; else
SliceIndex dest_ind = dest_start; {
for (; return 0;
(src_step > 0) ? (src_ind <= src_end) : (src_ind >= src_end);
src_ind += src_step, dest_ind += dest_step
) {
/* for constant optimization */
if (size == 1) {
__builtin_memcpy(dest_arr + dest_ind, src_arr + src_ind, 1);
} else if (size == 4) {
__builtin_memcpy(dest_arr + dest_ind * 4, src_arr + src_ind * 4, 4);
} else if (size == 8) {
__builtin_memcpy(dest_arr + dest_ind * 8, src_arr + src_ind * 8, 8);
} else {
/* memcpy for var size, cannot overlap after previous alloca */
__builtin_memcpy(dest_arr + dest_ind * size, src_arr + src_ind * size, size);
} }
} }
/* only dest_step == 1 can we shrink the dest list. */
/* size should be ensured prior to calling this function */
if (dest_step == 1 && dest_end >= dest_start) {
__builtin_memmove(
dest_arr + dest_ind * size,
dest_arr + (dest_end + 1) * size,
(dest_arr_len - dest_end - 1) * size
);
return dest_arr_len - (dest_end - dest_ind) - 1;
}
return dest_arr_len;
}
int32_t __nac3_isinf(double x) { // Handle list assignment and dropping part of the list when
return __builtin_isinf(x); // both dest_step and src_step are +1.
} // - All the index must *not* be out-of-bound or negative,
// - The end index is *inclusive*,
int32_t __nac3_isnan(double x) { // - The length of src and dest slice size should already
return __builtin_isnan(x); // be checked: if dest.step == 1 then len(src) <= len(dest) else len(src) == len(dest)
} SliceIndex __nac3_list_slice_assign_var_size(SliceIndex dest_start, SliceIndex dest_end, SliceIndex dest_step,
uint8_t *dest_arr, SliceIndex dest_arr_len, SliceIndex src_start,
double tgamma(double arg); SliceIndex src_end, SliceIndex src_step, uint8_t *src_arr,
SliceIndex src_arr_len, const SliceIndex size)
double __nac3_gamma(double z) { {
// Handling for denormals /* if dest_arr_len == 0, do nothing since we do not support extending list */
// | x | Python gamma(x) | C tgamma(x) | if (dest_arr_len == 0)
// --- | ----------------- | --------------- | ----------- | return dest_arr_len;
// (1) | nan | nan | nan | /* if both step is 1, memmove directly, handle the dropping of the list, and shrink size */
// (2) | -inf | -inf | inf | if (src_step == dest_step && dest_step == 1)
// (3) | inf | inf | inf | {
// (4) | 0.0 | inf | inf | const SliceIndex src_len = (src_end >= src_start) ? (src_end - src_start + 1) : 0;
// (5) | {-1.0, -2.0, ...} | inf | nan | const SliceIndex dest_len = (dest_end >= dest_start) ? (dest_end - dest_start + 1) : 0;
if (src_len > 0)
// (1)-(3) {
if (__builtin_isinf(z) || __builtin_isnan(z)) { __builtin_memmove(dest_arr + dest_start * size, src_arr + src_start * size, src_len * size);
return z; }
if (dest_len > 0)
{
/* dropping */
__builtin_memmove(dest_arr + (dest_start + src_len) * size, dest_arr + (dest_end + 1) * size,
(dest_arr_len - dest_end - 1) * size);
}
/* shrink size */
return dest_arr_len - (dest_len - src_len);
}
/* if two range overlaps, need alloca */
uint8_t need_alloca = (dest_arr == src_arr) && !(max(dest_start, dest_end) < min(src_start, src_end) ||
max(src_start, src_end) < min(dest_start, dest_end));
if (need_alloca)
{
uint8_t *tmp = reinterpret_cast<uint8_t *>(__builtin_alloca(src_arr_len * size));
__builtin_memcpy(tmp, src_arr, src_arr_len * size);
src_arr = tmp;
}
SliceIndex src_ind = src_start;
SliceIndex dest_ind = dest_start;
for (; (src_step > 0) ? (src_ind <= src_end) : (src_ind >= src_end); src_ind += src_step, dest_ind += dest_step)
{
/* for constant optimization */
if (size == 1)
{
__builtin_memcpy(dest_arr + dest_ind, src_arr + src_ind, 1);
}
else if (size == 4)
{
__builtin_memcpy(dest_arr + dest_ind * 4, src_arr + src_ind * 4, 4);
}
else if (size == 8)
{
__builtin_memcpy(dest_arr + dest_ind * 8, src_arr + src_ind * 8, 8);
}
else
{
/* memcpy for var size, cannot overlap after previous alloca */
__builtin_memcpy(dest_arr + dest_ind * size, src_arr + src_ind * size, size);
}
}
/* only dest_step == 1 can we shrink the dest list. */
/* size should be ensured prior to calling this function */
if (dest_step == 1 && dest_end >= dest_start)
{
__builtin_memmove(dest_arr + dest_ind * size, dest_arr + (dest_end + 1) * size,
(dest_arr_len - dest_end - 1) * size);
return dest_arr_len - (dest_end - dest_ind) - 1;
}
return dest_arr_len;
} }
double v = tgamma(z); int32_t __nac3_isinf(double x)
{
// (4)-(5) return __builtin_isinf(x);
return __builtin_isinf(v) || __builtin_isnan(v) ? __builtin_inf() : v;
}
double lgamma(double arg);
double __nac3_gammaln(double x) {
// libm's handling of value overflows differs from scipy:
// - scipy: gammaln(-inf) -> -inf
// - libm : lgamma(-inf) -> inf
if (__builtin_isinf(x)) {
return x;
} }
return lgamma(x); int32_t __nac3_isnan(double x)
} {
return __builtin_isnan(x);
double j0(double x);
double __nac3_j0(double x) {
// libm's handling of value overflows differs from scipy:
// - scipy: j0(inf) -> nan
// - libm : j0(inf) -> 0.0
if (__builtin_isinf(x)) {
return __builtin_nan("");
} }
return j0(x); double tgamma(double arg);
}
uint32_t __nac3_ndarray_calc_size( double __nac3_gamma(double z)
const uint32_t* list_data, {
uint32_t list_len, // Handling for denormals
uint32_t begin_idx, // | x | Python gamma(x) | C tgamma(x) |
uint32_t end_idx // --- | ----------------- | --------------- | ----------- |
) { // (1) | nan | nan | nan |
return __nac3_ndarray_calc_size_impl(list_data, list_len, begin_idx, end_idx); // (2) | -inf | -inf | inf |
} // (3) | inf | inf | inf |
// (4) | 0.0 | inf | inf |
// (5) | {-1.0, -2.0, ...} | inf | nan |
uint64_t __nac3_ndarray_calc_size64( // (1)-(3)
const uint64_t* list_data, if (__builtin_isinf(z) || __builtin_isnan(z))
uint64_t list_len, {
uint64_t begin_idx, return z;
uint64_t end_idx }
) {
return __nac3_ndarray_calc_size_impl(list_data, list_len, begin_idx, end_idx);
}
void __nac3_ndarray_calc_nd_indices( double v = tgamma(z);
uint32_t index,
const uint32_t* dims,
uint32_t num_dims,
NDIndex* idxs
) {
__nac3_ndarray_calc_nd_indices_impl(index, dims, num_dims, idxs);
}
void __nac3_ndarray_calc_nd_indices64( // (4)-(5)
uint64_t index, return __builtin_isinf(v) || __builtin_isnan(v) ? __builtin_inf() : v;
const uint64_t* dims, }
uint64_t num_dims,
NDIndex* idxs
) {
__nac3_ndarray_calc_nd_indices_impl(index, dims, num_dims, idxs);
}
uint32_t __nac3_ndarray_flatten_index( double lgamma(double arg);
const uint32_t* dims,
uint32_t num_dims,
const NDIndex* indices,
uint32_t num_indices
) {
return __nac3_ndarray_flatten_index_impl(dims, num_dims, indices, num_indices);
}
uint64_t __nac3_ndarray_flatten_index64( double __nac3_gammaln(double x)
const uint64_t* dims, {
uint64_t num_dims, // libm's handling of value overflows differs from scipy:
const NDIndex* indices, // - scipy: gammaln(-inf) -> -inf
uint64_t num_indices // - libm : lgamma(-inf) -> inf
) {
return __nac3_ndarray_flatten_index_impl(dims, num_dims, indices, num_indices);
}
void __nac3_ndarray_calc_broadcast( if (__builtin_isinf(x))
const uint32_t* lhs_dims, {
uint32_t lhs_ndims, return x;
const uint32_t* rhs_dims, }
uint32_t rhs_ndims,
uint32_t* out_dims
) {
return __nac3_ndarray_calc_broadcast_impl(lhs_dims, lhs_ndims, rhs_dims, rhs_ndims, out_dims);
}
void __nac3_ndarray_calc_broadcast64( return lgamma(x);
const uint64_t* lhs_dims, }
uint64_t lhs_ndims,
const uint64_t* rhs_dims,
uint64_t rhs_ndims,
uint64_t* out_dims
) {
return __nac3_ndarray_calc_broadcast_impl(lhs_dims, lhs_ndims, rhs_dims, rhs_ndims, out_dims);
}
void __nac3_ndarray_calc_broadcast_idx( double j0(double x);
const uint32_t* src_dims,
uint32_t src_ndims,
const NDIndex* in_idx,
NDIndex* out_idx
) {
__nac3_ndarray_calc_broadcast_idx_impl(src_dims, src_ndims, in_idx, out_idx);
}
void __nac3_ndarray_calc_broadcast_idx64( double __nac3_j0(double x)
const uint64_t* src_dims, {
uint64_t src_ndims, // libm's handling of value overflows differs from scipy:
const NDIndex* in_idx, // - scipy: j0(inf) -> nan
NDIndex* out_idx // - libm : j0(inf) -> 0.0
) {
__nac3_ndarray_calc_broadcast_idx_impl(src_dims, src_ndims, in_idx, out_idx); if (__builtin_isinf(x))
} {
} // extern "C" return __builtin_nan("");
}
return j0(x);
}
// 32-bit entry point for the element-count helper.
//
// Forwards every argument unchanged to `__nac3_ndarray_calc_size_impl`, which
// accumulates the product of `list_data[begin_idx .. end_idx)`. The impl tells
// the optimizer to assume `end_idx <= list_len` and that every visited entry
// is greater than zero, so callers must uphold both.
uint32_t __nac3_ndarray_calc_size(
const uint32_t *list_data,
uint32_t list_len,
uint32_t begin_idx,
uint32_t end_idx)
{
const uint32_t num_elems = __nac3_ndarray_calc_size_impl(list_data, list_len, begin_idx, end_idx);
return num_elems;
}
// 64-bit entry point for the element-count helper.
//
// Same contract as the 32-bit variant: the arguments are handed straight to
// `__nac3_ndarray_calc_size_impl`, which accumulates the product of
// `list_data[begin_idx .. end_idx)` under the assumptions
// `end_idx <= list_len` and "every visited entry is > 0".
uint64_t __nac3_ndarray_calc_size64(
const uint64_t *list_data,
uint64_t list_len,
uint64_t begin_idx,
uint64_t end_idx)
{
const uint64_t num_elems = __nac3_ndarray_calc_size_impl(list_data, list_len, begin_idx, end_idx);
return num_elems;
}
// 32-bit entry point that turns a flat element `index` into per-dimension
// indices.
//
// All work is done by `__nac3_ndarray_calc_nd_indices_impl`, which walks the
// dimensions from last to first and writes `(index / stride) % dims[i]` into
// `idxs[i]`. `idxs` therefore needs `num_dims` writable slots, and the impl
// assumes every `dims[i]` is greater than zero.
void __nac3_ndarray_calc_nd_indices(
uint32_t index,
const uint32_t *dims,
uint32_t num_dims,
NDIndex *idxs)
{
__nac3_ndarray_calc_nd_indices_impl(index, dims, num_dims, idxs);
}
// 64-bit entry point that turns a flat element `index` into per-dimension
// indices.
//
// Delegates to `__nac3_ndarray_calc_nd_indices_impl`, which walks the
// dimensions from last to first and writes `(index / stride) % dims[i]` into
// `idxs[i]`. The output indices stay `NDIndex`-typed even though the sizes
// are 64-bit; the impl assumes every `dims[i]` is greater than zero.
void __nac3_ndarray_calc_nd_indices64(
uint64_t index,
const uint64_t *dims,
uint64_t num_dims,
NDIndex *idxs)
{
__nac3_ndarray_calc_nd_indices_impl(index, dims, num_dims, idxs);
}
// 32-bit entry point that collapses per-dimension `indices` into one flat
// offset.
//
// Delegates to `__nac3_ndarray_flatten_index_impl`, which iterates the
// dimensions from last to first and adds `stride * indices[ri]` for every
// position `ri` that is below `num_indices`.
uint32_t __nac3_ndarray_flatten_index(
const uint32_t *dims,
uint32_t num_dims,
const NDIndex *indices,
uint32_t num_indices)
{
const uint32_t flat_idx = __nac3_ndarray_flatten_index_impl(dims, num_dims, indices, num_indices);
return flat_idx;
}
// 64-bit entry point that collapses per-dimension `indices` into one flat
// offset.
//
// Delegates to `__nac3_ndarray_flatten_index_impl`, which iterates the
// dimensions from last to first and adds `stride * indices[ri]` for every
// position `ri` that is below `num_indices`. The indices themselves remain
// `NDIndex`-typed; only the sizes and the result are 64-bit.
uint64_t __nac3_ndarray_flatten_index64(
const uint64_t *dims,
uint64_t num_dims,
const NDIndex *indices,
uint64_t num_indices)
{
const uint64_t flat_idx = __nac3_ndarray_flatten_index_impl(dims, num_dims, indices, num_indices);
return flat_idx;
}
// 32-bit entry point that computes the broadcast shape of two operands.
//
// Delegates to `__nac3_ndarray_calc_broadcast_impl`, which aligns both shapes
// on their trailing dimension and fills `out_dims` with
// max(lhs_ndims, rhs_ndims) entries. A dimension that is missing or equal to 1
// on one side takes the other side's size. Sizes that differ with neither
// being 1 reach `__builtin_unreachable()` in the impl, so compatibility has to
// be validated before calling.
void __nac3_ndarray_calc_broadcast(
const uint32_t *lhs_dims,
uint32_t lhs_ndims,
const uint32_t *rhs_dims,
uint32_t rhs_ndims,
uint32_t *out_dims)
{
__nac3_ndarray_calc_broadcast_impl(lhs_dims, lhs_ndims, rhs_dims, rhs_ndims, out_dims);
}
// 64-bit entry point that computes the broadcast shape of two operands.
//
// Delegates to `__nac3_ndarray_calc_broadcast_impl`, which aligns both shapes
// on their trailing dimension and fills `out_dims` with
// max(lhs_ndims, rhs_ndims) entries. A dimension that is missing or equal to 1
// on one side takes the other side's size. Sizes that differ with neither
// being 1 reach `__builtin_unreachable()` in the impl, so compatibility has to
// be validated before calling.
void __nac3_ndarray_calc_broadcast64(
const uint64_t *lhs_dims,
uint64_t lhs_ndims,
const uint64_t *rhs_dims,
uint64_t rhs_ndims,
uint64_t *out_dims)
{
__nac3_ndarray_calc_broadcast_impl(lhs_dims, lhs_ndims, rhs_dims, rhs_ndims, out_dims);
}
// 32-bit entry point that maps an index into a broadcast result back onto the
// source array.
//
// Delegates to `__nac3_ndarray_calc_broadcast_idx_impl`: for each of the
// `src_ndims` positions, `out_idx[i]` becomes 0 when `src_dims[i] == 1` and
// `in_idx[i]` otherwise. Both index buffers are read/written for exactly
// `src_ndims` entries.
void __nac3_ndarray_calc_broadcast_idx(
const uint32_t *src_dims,
uint32_t src_ndims,
const NDIndex *in_idx,
NDIndex *out_idx)
{
__nac3_ndarray_calc_broadcast_idx_impl(src_dims, src_ndims, in_idx, out_idx);
}
// 64-bit entry point that maps an index into a broadcast result back onto the
// source array.
//
// Delegates to `__nac3_ndarray_calc_broadcast_idx_impl`: for each of the
// `src_ndims` positions, `out_idx[i]` becomes 0 when `src_dims[i] == 1` and
// `in_idx[i]` otherwise. Index buffers stay `NDIndex`-typed; only the shape
// description is 64-bit.
void __nac3_ndarray_calc_broadcast_idx64(
const uint64_t *src_dims,
uint64_t src_ndims,
const NDIndex *in_idx,
NDIndex *out_idx)
{
__nac3_ndarray_calc_broadcast_idx_impl(src_dims, src_ndims, in_idx, out_idx);
}
} // extern "C"