Merge pull request #1081 from smr97/dev

Make sparse-times-sparse faster
Andreas Borgen Longva 2022-05-30 15:14:42 +02:00 committed by GitHub
commit dd801567f2
5 changed files with 233 additions and 48 deletions

View File

@@ -3,7 +3,7 @@ use crate::csr::CsrMatrix;
 use crate::ops::serial::{
     spadd_csc_prealloc, spadd_csr_prealloc, spadd_pattern, spmm_csc_dense, spmm_csc_pattern,
-    spmm_csc_prealloc, spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc,
+    spmm_csc_prealloc_unchecked, spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc_unchecked,
 };
 use crate::ops::Op;
 use nalgebra::allocator::Allocator;
@@ -112,9 +112,9 @@ macro_rules! impl_spmm {
     }
 }
-impl_spmm!(CsrMatrix, spmm_csr_pattern, spmm_csr_prealloc);
+impl_spmm!(CsrMatrix, spmm_csr_pattern, spmm_csr_prealloc_unchecked);
 // Need to switch order of operations for CSC pattern
-impl_spmm!(CscMatrix, spmm_csc_pattern, spmm_csc_prealloc);
+impl_spmm!(CscMatrix, spmm_csc_pattern, spmm_csc_prealloc_unchecked);
 /// Implements Scalar * Matrix operations for *concrete* scalar types. The reason this is necessary
 /// is that we are not able to implement Mul<Matrix<T>> for all T generically due to orphan rules.
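In practice, the change in this first file means that the `Mul` operator on sparse matrices (`&a * &b`) now allocates the exact product pattern and then fills it with the unchecked kernel. A small sketch of what that looks like from the caller's side (the matrix data is made up for illustration):

use nalgebra_sparse::csr::CsrMatrix;

fn main() {
    // A = [[2, 0], [0, 3]], B = [[4, 5], [0, 0]] (toy data)
    let a = CsrMatrix::try_from_csr_data(2, 2, vec![0, 1, 2], vec![0, 1], vec![2.0, 3.0]).unwrap();
    let b = CsrMatrix::try_from_csr_data(2, 2, vec![0, 2, 2], vec![0, 1], vec![4.0, 5.0]).unwrap();
    // `*` builds C's pattern first, then fills it with the `_unchecked` kernel;
    // the pattern is exact by construction, so skipping the per-entry checks is sound.
    let c = &a * &b;
    assert_eq!(c.values(), &[8.0, 10.0]);
}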

View File

@@ -20,6 +20,51 @@ fn spmm_cs_unexpected_entry() -> OperationError {
 /// reversed (since transpose(AB) = transpose(B) * transpose(A) and CSC(A) = transpose(CSR(A))).
 ///
 /// We assume here that the matrices have already been verified to be dimensionally compatible.
+pub fn spmm_cs_prealloc_unchecked<T>(
+    beta: T,
+    c: &mut CsMatrix<T>,
+    alpha: T,
+    a: &CsMatrix<T>,
+    b: &CsMatrix<T>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    assert_eq!(c.pattern().major_dim(), a.pattern().major_dim());
+    assert_eq!(c.pattern().minor_dim(), b.pattern().minor_dim());
+    let some_val = Zero::zero();
+    let mut scratchpad_values: Vec<T> = vec![some_val; b.pattern().minor_dim()];
+    for i in 0..c.pattern().major_dim() {
+        let a_lane_i = a.get_lane(i).unwrap();
+        let mut c_lane_i = c.get_lane_mut(i).unwrap();
+        for (&k, a_ik) in a_lane_i.minor_indices().iter().zip(a_lane_i.values()) {
+            let b_lane_k = b.get_lane(k).unwrap();
+            let alpha_aik = alpha.clone() * a_ik.clone();
+            for (j, b_kj) in b_lane_k.minor_indices().iter().zip(b_lane_k.values()) {
+                // use a dense scatter vector to accumulate non-zeros quickly
+                unsafe {
+                    *scratchpad_values.get_unchecked_mut(*j) += alpha_aik.clone() * b_kj.clone();
+                }
+            }
+        }
+        // Get indices from C's pattern and gather from the dense scratchpad_values
+        let (indices, values) = c_lane_i.indices_and_values_mut();
+        values
+            .iter_mut()
+            .zip(indices)
+            .for_each(|(output_ref, index)| unsafe {
+                *output_ref = beta.clone() * output_ref.clone()
+                    + scratchpad_values.get_unchecked(*index).clone();
+                *scratchpad_values.get_unchecked_mut(*index) = Zero::zero();
+            });
+    }
+
+    Ok(())
+}
+
 pub fn spmm_cs_prealloc<T>(
     beta: T,
     c: &mut CsMatrix<T>,
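This scatter/gather kernel is the core of the speedup: each lane of C is accumulated in a dense workspace, then gathered back out in the order of C's precomputed pattern, resetting only the touched slots. For intuition, here is a self-contained sketch of the same technique on a bare-bones CSR representation; the `Csr` struct and `spgemm_row` helper are hypothetical names, and alpha = 1, beta = 0 for brevity:

// Hypothetical bare-bones CSR storage, for illustration only.
struct Csr {
    offsets: Vec<usize>, // row i occupies indices[offsets[i]..offsets[i + 1]]
    indices: Vec<usize>, // column index of each stored entry
    values: Vec<f64>,
    ncols: usize,
}

// Compute row i of C = A * B with a dense accumulator: scatter the row's
// contributions into `acc`, then gather them back in C's precomputed column
// order, resetting `acc` so it can be reused for the next row.
fn spgemm_row(a: &Csr, b: &Csr, c: &mut Csr, acc: &mut [f64], i: usize) {
    for aa in a.offsets[i]..a.offsets[i + 1] {
        let (k, a_ik) = (a.indices[aa], a.values[aa]);
        for bb in b.offsets[k]..b.offsets[k + 1] {
            acc[b.indices[bb]] += a_ik * b.values[bb]; // scatter
        }
    }
    for cc in c.offsets[i]..c.offsets[i + 1] {
        let j = c.indices[cc];
        c.values[cc] = acc[j]; // gather in pattern order
        acc[j] = 0.0; // reset only touched slots, not the whole vector
    }
}

fn main() {
    // A = [[2, 0], [0, 3]], B = [[4, 5], [0, 0]] => C = [[8, 10], [0, 0]]
    let a = Csr { offsets: vec![0, 1, 2], indices: vec![0, 1], values: vec![2.0, 3.0], ncols: 2 };
    let b = Csr { offsets: vec![0, 2, 2], indices: vec![0, 1], values: vec![4.0, 5.0], ncols: 2 };
    // C's pattern is assumed precomputed (row 0: columns 0 and 1; row 1: empty).
    let mut c = Csr { offsets: vec![0, 2, 2], indices: vec![0, 1], values: vec![0.0, 0.0], ncols: 2 };
    let mut acc = vec![0.0; b.ncols];
    for i in 0..2 {
        spgemm_row(&a, &b, &mut c, &mut acc, i);
    }
    assert_eq!(c.values, vec![8.0, 10.0]);
}

The sketch also shows why the kernel is "unchecked": if A * B produces an entry at a column missing from C's pattern, the gather pass never resets that slot of the accumulator, and the stale value silently corrupts later rows; hence the advice below to create C's pattern immediately before multiplying.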

View File

@ -1,5 +1,7 @@
use crate::csc::CscMatrix; use crate::csc::CscMatrix;
use crate::ops::serial::cs::{spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc}; use crate::ops::serial::cs::{
spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc, spmm_cs_prealloc_unchecked,
};
use crate::ops::serial::{OperationError, OperationErrorKind}; use crate::ops::serial::{OperationError, OperationErrorKind};
use crate::ops::Op; use crate::ops::Op;
use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, RealField, Scalar}; use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, RealField, Scalar};
@ -83,14 +85,65 @@ where
{ {
assert_compatible_spmm_dims!(c, a, b); assert_compatible_spmm_dims!(c, a, b);
use Op::{NoOp, Transpose}; use Op::NoOp;
match (&a, &b) { match (&a, &b) {
(NoOp(ref a), NoOp(ref b)) => { (NoOp(ref a), NoOp(ref b)) => {
// Note: We have to reverse the order for CSC matrices // Note: We have to reverse the order for CSC matrices
spmm_cs_prealloc(beta, &mut c.cs, alpha, &b.cs, &a.cs) spmm_cs_prealloc(beta, &mut c.cs, alpha, &b.cs, &a.cs)
} }
_ => { _ => spmm_csc_transposed(beta, c, alpha, a, b, spmm_csc_prealloc),
}
}
/// Faster sparse-sparse matrix multiplication, `C <- beta * C + alpha * op(A) * op(B)`.
/// This will not return an error even if the patterns don't match.
/// Should be used for situations where pattern creation immediately preceeds multiplication.
///
/// Panics if the dimensions of the matrices involved are not compatible with the expression.
pub fn spmm_csc_prealloc_unchecked<T>(
beta: T,
c: &mut CscMatrix<T>,
alpha: T,
a: Op<&CscMatrix<T>>,
b: Op<&CscMatrix<T>>,
) -> Result<(), OperationError>
where
T: Scalar + ClosedAdd + ClosedMul + Zero + One,
{
assert_compatible_spmm_dims!(c, a, b);
use Op::NoOp;
match (&a, &b) {
(NoOp(ref a), NoOp(ref b)) => {
// Note: We have to reverse the order for CSC matrices
spmm_cs_prealloc_unchecked(beta, &mut c.cs, alpha, &b.cs, &a.cs)
}
_ => spmm_csc_transposed(beta, c, alpha, a, b, spmm_csc_prealloc_unchecked),
}
}
fn spmm_csc_transposed<T, F>(
beta: T,
c: &mut CscMatrix<T>,
alpha: T,
a: Op<&CscMatrix<T>>,
b: Op<&CscMatrix<T>>,
spmm_kernel: F,
) -> Result<(), OperationError>
where
T: Scalar + ClosedAdd + ClosedMul + Zero + One,
F: Fn(
T,
&mut CscMatrix<T>,
T,
Op<&CscMatrix<T>>,
Op<&CscMatrix<T>>,
) -> Result<(), OperationError>,
{
use Op::{NoOp, Transpose};
// Currently we handle transposition by explicitly precomputing transposed matrices // Currently we handle transposition by explicitly precomputing transposed matrices
// and calling the operation again without transposition // and calling the operation again without transposition
let a_ref: &CscMatrix<T> = a.inner_ref(); let a_ref: &CscMatrix<T> = a.inner_ref();
@ -101,15 +154,10 @@ where
(NoOp(_), NoOp(_)) => unreachable!(), (NoOp(_), NoOp(_)) => unreachable!(),
(Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)), (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
(NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())), (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
(Transpose(ref a), Transpose(ref b)) => { (Transpose(ref a), Transpose(ref b)) => (Owned(a.transpose()), Owned(b.transpose())),
(Owned(a.transpose()), Owned(b.transpose()))
}
} }
}; };
spmm_kernel(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
spmm_csc_prealloc(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
}
}
} }
/// Solve the lower triangular system `op(L) X = B`. /// Solve the lower triangular system `op(L) X = B`.
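The intended calling sequence for the new `_unchecked` entry points, per the doc comment above: build C's pattern from A's and B's patterns, then multiply into it right away. A sketch against the crate's public API (toy diagonal matrices; the `_unchecked` signature is the one introduced in this PR):

use nalgebra_sparse::csc::CscMatrix;
use nalgebra_sparse::ops::serial::{spmm_csc_pattern, spmm_csc_prealloc_unchecked};
use nalgebra_sparse::ops::Op;

fn main() {
    // A = diag(2, 3) and B = diag(4, 5), stored as CSC (toy data).
    let a = CscMatrix::try_from_csc_data(2, 2, vec![0, 1, 2], vec![0, 1], vec![2.0, 3.0]).unwrap();
    let b = CscMatrix::try_from_csc_data(2, 2, vec![0, 1, 2], vec![0, 1], vec![4.0, 5.0]).unwrap();

    // Pattern creation immediately precedes the multiplication, so C's pattern
    // is exactly the pattern of A * B and the unchecked kernel's precondition holds.
    let pattern = spmm_csc_pattern(a.pattern(), b.pattern());
    let nnz = pattern.nnz();
    let mut c = CscMatrix::try_from_pattern_and_values(pattern, vec![0.0; nnz]).unwrap();
    spmm_csc_prealloc_unchecked(0.0, &mut c, 1.0, Op::NoOp(&a), Op::NoOp(&b)).unwrap();
    assert_eq!(c.values(), &[8.0, 15.0]);
}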

View File

@ -1,5 +1,7 @@
use crate::csr::CsrMatrix; use crate::csr::CsrMatrix;
use crate::ops::serial::cs::{spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc}; use crate::ops::serial::cs::{
spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc, spmm_cs_prealloc_unchecked,
};
use crate::ops::serial::OperationError; use crate::ops::serial::OperationError;
use crate::ops::Op; use crate::ops::Op;
use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, Scalar}; use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, Scalar};
@ -77,15 +79,63 @@ where
{ {
assert_compatible_spmm_dims!(c, a, b); assert_compatible_spmm_dims!(c, a, b);
use Op::{NoOp, Transpose}; use Op::NoOp;
match (&a, &b) { match (&a, &b) {
(NoOp(ref a), NoOp(ref b)) => spmm_cs_prealloc(beta, &mut c.cs, alpha, &a.cs, &b.cs), (NoOp(ref a), NoOp(ref b)) => spmm_cs_prealloc(beta, &mut c.cs, alpha, &a.cs, &b.cs),
_ => { _ => spmm_csr_transposed(beta, c, alpha, a, b, spmm_csr_prealloc),
}
}
/// Faster sparse-sparse matrix multiplication, `C <- beta * C + alpha * op(A) * op(B)`.
/// This will not return an error even if the patterns don't match.
/// Should be used for situations where pattern creation immediately preceeds multiplication.
///
/// Panics if the dimensions of the matrices involved are not compatible with the expression.
pub fn spmm_csr_prealloc_unchecked<T>(
beta: T,
c: &mut CsrMatrix<T>,
alpha: T,
a: Op<&CsrMatrix<T>>,
b: Op<&CsrMatrix<T>>,
) -> Result<(), OperationError>
where
T: Scalar + ClosedAdd + ClosedMul + Zero + One,
{
assert_compatible_spmm_dims!(c, a, b);
use Op::NoOp;
match (&a, &b) {
(NoOp(ref a), NoOp(ref b)) => {
spmm_cs_prealloc_unchecked(beta, &mut c.cs, alpha, &a.cs, &b.cs)
}
_ => spmm_csr_transposed(beta, c, alpha, a, b, spmm_csr_prealloc_unchecked),
}
}
fn spmm_csr_transposed<T, F>(
beta: T,
c: &mut CsrMatrix<T>,
alpha: T,
a: Op<&CsrMatrix<T>>,
b: Op<&CsrMatrix<T>>,
spmm_kernel: F,
) -> Result<(), OperationError>
where
T: Scalar + ClosedAdd + ClosedMul + Zero + One,
F: Fn(
T,
&mut CsrMatrix<T>,
T,
Op<&CsrMatrix<T>>,
Op<&CsrMatrix<T>>,
) -> Result<(), OperationError>,
{
use Op::{NoOp, Transpose};
// Currently we handle transposition by explicitly precomputing transposed matrices // Currently we handle transposition by explicitly precomputing transposed matrices
// and calling the operation again without transposition // and calling the operation again without transposition
// TODO: At least use workspaces to allow control of allocations. Maybe
// consider implementing certain patterns (like A^T * B) explicitly
let a_ref: &CsrMatrix<T> = a.inner_ref(); let a_ref: &CsrMatrix<T> = a.inner_ref();
let b_ref: &CsrMatrix<T> = b.inner_ref(); let b_ref: &CsrMatrix<T> = b.inner_ref();
let (a, b) = { let (a, b) = {
@ -94,13 +144,8 @@ where
(NoOp(_), NoOp(_)) => unreachable!(), (NoOp(_), NoOp(_)) => unreachable!(),
(Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)), (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
(NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())), (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
(Transpose(ref a), Transpose(ref b)) => { (Transpose(ref a), Transpose(ref b)) => (Owned(a.transpose()), Owned(b.transpose())),
(Owned(a.transpose()), Owned(b.transpose()))
}
} }
}; };
spmm_kernel(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
spmm_csr_prealloc(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
}
}
} }
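Transposed operands go through the shared `spmm_csr_transposed` helper, which precomputes the transposed matrix and recurses with `NoOp`; since the finishing kernel is passed in as a parameter, the checked and unchecked variants reuse the same logic. A sketch of `C <- A^T * B` through the unchecked path (toy data again; note that C's pattern must be built from A^T):

use nalgebra_sparse::csr::CsrMatrix;
use nalgebra_sparse::ops::serial::{spmm_csr_pattern, spmm_csr_prealloc_unchecked};
use nalgebra_sparse::ops::Op;

fn main() {
    // A = [[0, 2], [0, 3]], B = diag(4, 5); we compute C = A^T * B.
    let a = CsrMatrix::try_from_csr_data(2, 2, vec![0, 1, 2], vec![1, 1], vec![2.0, 3.0]).unwrap();
    let b = CsrMatrix::try_from_csr_data(2, 2, vec![0, 1, 2], vec![0, 1], vec![4.0, 5.0]).unwrap();

    // C's pattern must be the pattern of A^T * B, so build it from A^T.
    let pattern = spmm_csr_pattern(a.transpose().pattern(), b.pattern());
    let nnz = pattern.nnz();
    let mut c = CsrMatrix::try_from_pattern_and_values(pattern, vec![0.0; nnz]).unwrap();

    // Internally this precomputes A^T (spmm_csr_transposed) and recurses with NoOp.
    spmm_csr_prealloc_unchecked(0.0, &mut c, 1.0, Op::Transpose(&a), Op::NoOp(&b)).unwrap();
    assert_eq!(c.values(), &[8.0, 15.0]);
}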

View File

@@ -6,7 +6,8 @@ use nalgebra_sparse::csc::CscMatrix;
 use nalgebra_sparse::csr::CsrMatrix;
 use nalgebra_sparse::ops::serial::{
-    spadd_csc_prealloc, spadd_csr_prealloc, spadd_pattern, spmm_csc_dense, spmm_csc_prealloc,
-    spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc, spsolve_csc_lower_triangular,
+    spadd_csc_prealloc, spadd_csr_prealloc, spadd_pattern, spmm_csc_dense, spmm_csc_prealloc,
+    spmm_csc_prealloc_unchecked, spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc,
+    spmm_csr_prealloc_unchecked, spsolve_csc_lower_triangular,
 };
 use nalgebra_sparse::ops::Op;
 use nalgebra_sparse::pattern::SparsityPattern;
@@ -543,6 +544,29 @@ proptest! {
         prop_assert_eq!(&c_pattern, c_csr.pattern());
     }
+
+    #[test]
+    fn spmm_csr_prealloc_unchecked_test(SpmmCsrArgs { c, beta, alpha, a, b }
+        in spmm_csr_prealloc_args_strategy()
+    ) {
+        // Test that we get the expected result by comparing to an equivalent dense operation
+        // (here we pass in the C matrix, so the sparsity pattern is essentially fixed)
+        let mut c_sparse = c.clone();
+        spmm_csr_prealloc_unchecked(beta, &mut c_sparse, alpha, a.as_ref(), b.as_ref()).unwrap();
+
+        let mut c_dense = DMatrix::from(&c);
+        let op_a_dense = match a {
+            Op::NoOp(ref a) => DMatrix::from(a),
+            Op::Transpose(ref a) => DMatrix::from(a).transpose(),
+        };
+        let op_b_dense = match b {
+            Op::NoOp(ref b) => DMatrix::from(b),
+            Op::Transpose(ref b) => DMatrix::from(b).transpose(),
+        };
+        c_dense = beta * c_dense + alpha * &op_a_dense * op_b_dense;
+
+        prop_assert_eq!(&DMatrix::from(&c_sparse), &c_dense);
+    }

     #[test]
     fn spmm_csr_prealloc_test(SpmmCsrArgs { c, beta, alpha, a, b }
         in spmm_csr_prealloc_args_strategy()
@@ -705,6 +729,29 @@ proptest! {
         prop_assert_eq!(&DMatrix::from(&c_sparse), &c_dense);
     }
+
+    #[test]
+    fn spmm_csc_prealloc_unchecked_test(SpmmCscArgs { c, beta, alpha, a, b }
+        in spmm_csc_prealloc_args_strategy()
+    ) {
+        // Test that we get the expected result by comparing to an equivalent dense operation
+        // (here we pass in the C matrix, so the sparsity pattern is essentially fixed)
+        let mut c_sparse = c.clone();
+        spmm_csc_prealloc_unchecked(beta, &mut c_sparse, alpha, a.as_ref(), b.as_ref()).unwrap();
+
+        let mut c_dense = DMatrix::from(&c);
+        let op_a_dense = match a {
+            Op::NoOp(ref a) => DMatrix::from(a),
+            Op::Transpose(ref a) => DMatrix::from(a).transpose(),
+        };
+        let op_b_dense = match b {
+            Op::NoOp(ref b) => DMatrix::from(b),
+            Op::Transpose(ref b) => DMatrix::from(b).transpose(),
+        };
+        c_dense = beta * c_dense + alpha * &op_a_dense * op_b_dense;
+
+        prop_assert_eq!(&DMatrix::from(&c_sparse), &c_dense);
+    }

     #[test]
     fn spmm_csc_prealloc_panics_on_dim_mismatch(
         (alpha, beta, c, a, b)