From ff3d1e4e35cfe1d55e3393648767030e3b4a30c1 Mon Sep 17 00:00:00 2001 From: Saurabh Date: Fri, 18 Feb 2022 11:22:43 -0700 Subject: [PATCH] prealloc everything, remove hashset, make it 4x faster --- nalgebra-sparse/Cargo.toml | 6 ++++ nalgebra-sparse/src/ops/serial/cs.rs | 49 ++++++++++------------------ 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/nalgebra-sparse/Cargo.toml b/nalgebra-sparse/Cargo.toml index 70cac4c8..781d1696 100644 --- a/nalgebra-sparse/Cargo.toml +++ b/nalgebra-sparse/Cargo.toml @@ -48,3 +48,9 @@ path = "example/spmm.rs" [package.metadata.docs.rs] # Enable certain features when building docs for docs.rs features = [ "proptest-support", "compare" ] + +[profile.release] +opt-level = 3 +lto = "fat" +codegen-units = 1 +panic = "abort" diff --git a/nalgebra-sparse/src/ops/serial/cs.rs b/nalgebra-sparse/src/ops/serial/cs.rs index 1642571c..ec394c27 100644 --- a/nalgebra-sparse/src/ops/serial/cs.rs +++ b/nalgebra-sparse/src/ops/serial/cs.rs @@ -1,10 +1,9 @@ -use std::collections::HashSet; +//use std::collections::HashSet; use crate::cs::CsMatrix; use crate::ops::serial::{OperationError, OperationErrorKind}; use crate::ops::Op; use crate::SparseEntryMut; -use itertools::Itertools; use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, Scalar}; use num_traits::{One, Zero}; @@ -33,56 +32,42 @@ pub fn spmm_cs_prealloc( where T: Scalar + ClosedAdd + ClosedMul + Zero + One, { + let some_val = Zero::zero(); + let mut scratchpad_values: Vec = vec![some_val; b.pattern().minor_dim()]; + let mut scratchpad_indices: Vec = vec![0; b.pattern().minor_dim()]; + let mut scratchpad_used: Vec = vec![false; b.pattern().minor_dim()]; + let mut right_end = 0usize; for i in 0..c.pattern().major_dim() { let a_lane_i = a.get_lane(i).unwrap(); - let some_val = Zero::zero(); - let mut scratchpad_values: Vec = vec![some_val; b.pattern().minor_dim()]; - let mut scratchpad_indices: HashSet = HashSet::new(); - let mut c_lane_i = c.get_lane_mut(i).unwrap(); - //let (indices, values) = c_lane_i.indices_and_values_mut(); - //indices - // .iter() - // .zip(values.iter()) - // .for_each(|(id, val)| scratchpad_values[*id] = beta.clone() * val.clone()); - - //for (index, c_ij) in c_lane_i.indices_and_values_mut() { - // *c_ij = beta.clone() * c_ij.clone(); - //} for (&k, a_ik) in a_lane_i.minor_indices().iter().zip(a_lane_i.values()) { let b_lane_k = b.get_lane(k).unwrap(); - //let (mut c_lane_i_cols, mut c_lane_i_values) = c_lane_i.indices_and_values_mut(); let alpha_aik = alpha.clone() * a_ik.clone(); for (j, b_kj) in b_lane_k.minor_indices().iter().zip(b_lane_k.values()) { // Determine the location in C to append the value - // TODO make a scratchpad and defer the accumulation into C after processing one - // full row of A. scratchpad_values[*j] += alpha_aik.clone() * b_kj.clone(); - scratchpad_indices.insert(*j); - //let (c_local_idx, _) = c_lane_i_cols - // .iter() - // .enumerate() - // .find(|(_, c_col)| *c_col == j) - // .ok_or_else(spmm_cs_unexpected_entry)?; - - //c_lane_i_values[c_local_idx] += alpha_aik.clone() * b_kj.clone(); - //c_lane_i_cols = &c_lane_i_cols[c_local_idx..]; - //c_lane_i_values = &mut c_lane_i_values[c_local_idx..]; + if !scratchpad_used[*j] { + scratchpad_indices[right_end] = *j; + right_end += 1; + scratchpad_used[*j] = true; + } } } // sort the indices, and then access the relevant indices (in sorted order) from values // into C. - let sorted_indices: Vec = - Itertools::sorted(scratchpad_indices.into_iter()).collect(); + scratchpad_indices[0..right_end].sort_unstable(); c_lane_i .values_mut() .iter_mut() - .zip(sorted_indices.into_iter()) + .zip(scratchpad_indices[0..right_end].iter()) .for_each(|(output_ref, index)| { - *output_ref = beta.clone() * output_ref.clone() + scratchpad_values[index].clone() + *output_ref = beta.clone() * output_ref.clone() + scratchpad_values[*index].clone(); + scratchpad_used[*index] = false; + scratchpad_values[*index] = Zero::zero(); }); + right_end = 0usize; } Ok(())