Improved CooMatrix proptest strategies

2020-11-18 13:54:14 +01:00 · 2020-11-18 13:54:14 +01:00 · 7260f05b07
parent 46442d6060
commit 7260f05b07
6 changed files with 312 additions and 29 deletions
--- a/nalgebra-sparse/Cargo.toml
+++ b/nalgebra-sparse/Cargo.toml
@ -7,7 +7,13 @@ edition = "2018"
 [features]
 proptest-support = ["proptest", "nalgebra/proptest"]
 # Enable to run some tests that take a lot of time to run
 slow-tests = []
 [dependencies]
 nalgebra = { version="0.23", path = "../" }
 num-traits = { version = "0.2", default-features = false }
 proptest = { version = "0.10", optional = true }
 [dev-dependencies]
 itertools = "0.9"
--- a/nalgebra-sparse/src/coo.rs
+++ b/nalgebra-sparse/src/coo.rs
@ -37,7 +37,7 @@ use num_traits::Zero;
 ///
 /// // TODO: Convert to CSR
 /// ```
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CooMatrix<T> {
    nrows: usize,
    ncols: usize,
--- a/nalgebra-sparse/src/proptest.rs
+++ b/nalgebra-sparse/src/proptest.rs
@ -4,38 +4,177 @@
 use crate::coo::CooMatrix;
 use proptest::prelude::*;
-use proptest::collection::{SizeRange, vec};
+use proptest::collection::{vec, hash_map};
 use nalgebra::Scalar;
 use std::cmp::min;
 use std::iter::repeat;
 use proptest::sample::{Index};
-/// TODO
+/// A strategy for generating `nnz` triplets.
-pub fn coo<T>(
+///
-                 value_strategy: T,
+/// This strategy should generally only be used when `nnz` is close to `nrows * ncols`.
 ///
 /// One boolean is allocated and shuffled for every cell of the matrix (`nrows * ncols` in
 /// total), so the work grows with the size of the matrix rather than with `nnz`.
 /// Each cell is selected at most once, hence no coordinate pair is repeated in the output.
 ///
 /// # Panics
 ///
 /// Panics if `nnz > nrows * ncols`.
-                 rows: impl Strategy<Value=usize> + 'static,
+fn dense_triplet_strategy<T>(value_strategy: T,
-                 cols: impl Strategy<Value=usize> + 'static,
+                             nrows: usize,
-                 max_nonzeros: usize) -> BoxedStrategy<CooMatrix<T::Value>>
+                             ncols: usize,
                             nnz: usize)
                             -> impl Strategy<Value=Vec<(usize, usize, T::Value)>>
 where
    T: Strategy + Clone + 'static,
    T::Value: Scalar,
 {
-    (rows, cols, (0 ..= max_nonzeros))
+    assert!(nnz <= nrows * ncols);
-        .prop_flat_map(move |(nrows, ncols, nnz)| {
+
-            // If the numbers of rows and columns are small in comparison with the
+    // Construct a number of booleans of which exactly `nnz` are true.
-            // max nnz, it will lead to small matrices essentially always turning out to be dense.
+    let booleans: Vec<_> = repeat(true)
-            // To address this, we correct the nnz by computing the modulo with the
+        .take(nnz)
-            // maximum number of non-zeros (ignoring duplicates) we can have for
+        .chain(repeat(false))
-            // the given dimensions.
+        .take(nrows * ncols)
-            // This way we can still generate very sparse matrices for small matrices.
+        .collect();
-            let max_nnz = nrows * ncols;
+
-            let nnz = if max_nnz == 0 { 0 } else { nnz % max_nnz };
+    Just(booleans)
        // Shuffle the booleans so that they are randomly distributed
        .prop_shuffle()
        // Convert the booleans into a list of coordinate pairs
        .prop_map(move |booleans| {
            booleans
                .into_iter()
                .enumerate()
                .filter_map(|(index, is_entry)| {
                    if is_entry {
                        // Convert linear index to row/col pair (the linear index runs
                        // through the cells row by row). `ncols` cannot be zero here:
                        // in that case there are no booleans to iterate over at all.
                        let i = index / ncols;
                        let j = index % ncols;
                        Some((i, j))
                    } else {
                        None
                    }
                })
                .collect::<Vec<_>>()
        })
        // Assign values to each coordinate pair in order to generate a list of triplets
        .prop_flat_map(move |coords| {
            // One copy of the value strategy per coordinate pair, so that one value is
            // generated for each selected cell
            vec![value_strategy.clone(); coords.len()]
                .prop_map(move |values| {
                    coords.clone().into_iter()
                        .zip(values)
                        .map(|((i, j), v)| {
                            (i, j, v)
                        })
                        .collect::<Vec<_>>()
                })
        })
 }
 /// A strategy for generating `nnz` triplets.
 ///
 /// This strategy should generally only be used when `nnz << nrows * ncols`. If `nnz` is too
 /// close to `nrows * ncols` it may fail due to excessive rejected samples.
 ///
 /// The coordinates are the keys of a generated hash map, so no coordinate pair is repeated
 /// in the output. The triplets are returned in shuffled order.
 ///
 /// NOTE(review): when `nrows` or `ncols` is zero, the index ranges below fall back to `0 .. 1`,
 /// so any generated coordinate would be out of bounds. Nothing in this function checks for
 /// that; it relies on the caller passing `nnz == 0` for empty matrices (`coo_no_duplicates`
 /// clamps `nnz` to `nrows * ncols`).
 fn sparse_triplet_strategy<T>(value_strategy: T,
                             nrows: usize,
                             ncols: usize,
                             nnz: usize)
                             -> impl Strategy<Value=Vec<(usize, usize, T::Value)>>
    where
        T: Strategy + Clone + 'static,
        T::Value: Scalar,
 {
    // Have to handle the zero case: proptest doesn't like empty ranges (i.e. 0 .. 0)
    let row_index_strategy = if nrows > 0 { 0 .. nrows } else { 0 .. 1 };
    let col_index_strategy = if ncols > 0 { 0 .. ncols } else { 0 .. 1 };
-            let row_indices = vec![row_index_strategy.clone(); nnz];
+    let coord_strategy = (row_index_strategy, col_index_strategy);
-            let col_indices = vec![col_index_strategy.clone(); nnz];
+    hash_map(coord_strategy, value_strategy.clone(), nnz)
-            let values_strategy = vec![value_strategy.clone(); nnz];
+        .prop_map(|hash_map| {
-
+            let triplets: Vec<_> = hash_map
-            (Just(nrows), Just(ncols), row_indices, col_indices, values_strategy)
+                .into_iter()
-        }).prop_map(|(nrows, ncols, row_indices, col_indices, values)| {
+                .map(|((i, j), v)| (i, j, v))
-            CooMatrix::try_from_triplets(nrows, ncols, row_indices, col_indices, values)
+                .collect();
-                .expect("We should always generate valid COO data.")
+            triplets
-        }).boxed()
+        })
        // Although order in the hash map is unspecified, it's not necessarily *random*
        // - or, in particular, it does not necessarily sample the whole space of possible outcomes -
        // so we additionally shuffle the triplets
        .prop_shuffle()
 }
 /// A strategy for generating [`CooMatrix`] instances without duplicate entries.
 ///
 /// The matrix dimensions are drawn from `rows` and `cols`. The number of entries is drawn
 /// from `0 ..= min(max_nonzeros, nrows * ncols)`, and every entry value is produced by
 /// `value_strategy`. Each `(row, column)` coordinate is pushed to the matrix at most once.
 pub fn coo_no_duplicates<T>(
    value_strategy: T,
    rows: impl Strategy<Value=usize> + 'static,
    cols: impl Strategy<Value=usize> + 'static,
    max_nonzeros: usize) -> impl Strategy<Value=CooMatrix<T::Value>>
 where
    T: Strategy + Clone + 'static,
    T::Value: Scalar,
 {
    (rows, cols)
        .prop_flat_map(move |(nrows, ncols)| {
            // Without duplicates, a matrix cannot hold more entries than it has cells
            let nnz_upper_bound = min(max_nonzeros, nrows * ncols);
            let entry_strategy = value_strategy.clone();
            let triplet_strategy = (0 ..= nnz_upper_bound).prop_flat_map(move |nnz| {
                let entry_strategy = entry_strategy.clone();
                // Above a fill ratio of 10% we sample per cell. Below it we sample coordinates
                // through a hash map, so that the cost is rather on the order of the number
                // of entries than of nrows * ncols.
                let is_dense = nnz as f64 > 0.10 * (nrows as f64) * (ncols as f64);
                if is_dense {
                    dense_triplet_strategy(entry_strategy, nrows, ncols, nnz).boxed()
                } else {
                    sparse_triplet_strategy(entry_strategy, nrows, ncols, nnz).boxed()
                }
            });
            triplet_strategy.prop_map(move |triplets| {
                let mut matrix = CooMatrix::new(nrows, ncols);
                triplets
                    .into_iter()
                    .for_each(|(row, col, value)| {
                        matrix.push(row, col, value);
                    });
                matrix
            })
        })
 }
 /// A strategy for generating [`CooMatrix`] instances that may contain duplicate entries.
 ///
 /// A duplicate-free matrix is first generated with [`coo_no_duplicates`]. Then up to
 /// `max_duplicates` extra triplets are appended, each of which reuses the coordinates of a
 /// randomly selected existing triplet and carries a fresh value from `value_strategy`.
 /// Finally all triplets are shuffled. If the duplicate-free matrix has no entries,
 /// no extra triplets are added.
 ///
 /// Note that `value_strategy` only constrains the value of each individual triplet:
 /// the sum of the triplets that share a coordinate is not constrained.
 pub fn coo_with_duplicates<T>(
                 value_strategy: T,
                 rows: impl Strategy<Value=usize> + 'static,
                 cols: impl Strategy<Value=usize> + 'static,
                 max_nonzeros: usize,
                 max_duplicates: usize)
    -> impl Strategy<Value=CooMatrix<T::Value>>
 where
    T: Strategy + Clone + 'static,
    T::Value: Scalar,
 {
    let base_strategy = coo_no_duplicates(value_strategy.clone(), rows, cols, max_nonzeros);
    let extras_strategy = vec((any::<Index>(), value_strategy.clone()), 0 ..= max_duplicates);
    (base_strategy, extras_strategy)
        .prop_flat_map(|(base, extras)| {
            let mut entries: Vec<(usize, usize, T::Value)> = base
                .triplet_iter()
                .map(|(row, col, value)| (row, col, value.clone()))
                .collect();
            // An index can only select among existing entries, so there is
            // nothing to duplicate in a matrix without entries
            if !entries.is_empty() {
                let repeated: Vec<_> = extras
                    .into_iter()
                    .map(|(selector, value)| {
                        let (row, col, _) = selector.get(&entries);
                        (*row, *col, value)
                    })
                    .collect();
                entries.extend(repeated);
            }
            // Shuffling mixes the duplicates in with the original entries
            let shuffled_entries = Just(entries).prop_shuffle();
            (Just(base.nrows()), Just(base.ncols()), shuffled_entries)
        })
        .prop_map(move |(nrows, ncols, entries)| {
            let mut matrix = CooMatrix::new(nrows, ncols);
            entries
                .into_iter()
                .for_each(|(row, col, value)| {
                    matrix.push(row, col, value);
                });
            matrix
        })
 }
--- a/nalgebra-sparse/tests/unit.rs
+++ b/nalgebra-sparse/tests/unit.rs
@ -1,4 +1,7 @@
 //! Unit tests
 #[cfg(not(feature = "proptest-support"))]
 compile_error!("Tests must be run with feature proptest-support");
 mod unit_tests;
 #[macro_use]
--- a/nalgebra-sparse/tests/unit_tests/mod.rs
+++ b/nalgebra-sparse/tests/unit_tests/mod.rs
@ -3,3 +3,4 @@ mod ops;
 mod pattern;
 mod csr;
 mod csc;
 mod proptest;
--- a/nalgebra-sparse/tests/unit_tests/proptest.rs
+++ b/nalgebra-sparse/tests/unit_tests/proptest.rs
@ -0,0 +1,134 @@
 use nalgebra_sparse::proptest::{coo_with_duplicates, coo_no_duplicates};
 use nalgebra::DMatrix;
 use proptest::prelude::*;
 use itertools::Itertools;
 use std::collections::HashSet;
 use std::iter::repeat;
 #[cfg(feature = "slow-tests")]
 use {
    proptest::test_runner::TestRunner,
    proptest::strategy::ValueTree
 };
 use std::ops::RangeInclusive;
 #[cfg(feature = "slow-tests")]
 fn generate_all_possible_matrices(value_range: RangeInclusive<i32>,
                                  rows_range: RangeInclusive<usize>,
                                  cols_range: RangeInclusive<usize>)
    -> HashSet<DMatrix<i32>>
 {
    // Collects every dense matrix whose dimensions lie in the given ranges and
    // whose entries all lie in `value_range`
    let mut matrices = HashSet::new();
    for nrows in rows_range {
        for ncols in cols_range.clone() {
            let num_entries = nrows * ncols;
            if num_entries == 0 {
                // With zero rows or zero columns there is exactly one matrix of
                // this shape: the empty one
                matrices.insert(DMatrix::from_row_slice(nrows, ncols, &[]));
                continue;
            }
            // Every matrix of this shape corresponds to one element of the
            // (multi) Cartesian product of the value set with itself, taken once
            // per entry. For example, a 2x2 matrix corresponds to an element of
            //  V x V x V x V
            // where V is the set of eligible values, e.g. V := -1 ..= 1
            let shaped_matrices = repeat(value_range.clone())
                .take(num_entries)
                .multi_cartesian_product()
                .map(|entries| DMatrix::from_row_slice(nrows, ncols, &entries));
            matrices.extend(shaped_matrices);
        }
    }
    matrices
 }
 #[cfg(feature = "slow-tests")]
 #[test]
 fn coo_no_duplicates_samples_all_admissible_outputs() {
    // Note: This test basically mirrors a similar test for `matrix` in the `nalgebra` repo.
    // For a small input space, the strategy must produce every admissible matrix
    // provided that enough samples are drawn.
    // A deterministic runner keeps the outcome of the test "stable".
    let mut runner = TestRunner::deterministic();
    // Must be large enough that, with high probability, every possible case gets sampled
    let num_generated_matrices = 500000;
    let values = -1..=1;
    let rows = 0..=2;
    let cols = 0..=3;
    let strategy = coo_no_duplicates(values.clone(), rows.clone(), cols.clone(), 2 * 3);
    // The complete set of matrices that satisfy the constraints
    let expected = generate_all_possible_matrices(values, rows, cols);
    // The set of matrices that the strategy actually produced
    let observed: HashSet<_> = (0..num_generated_matrices)
        .map(|_| {
            let coo = strategy
                .new_tree(&mut runner)
                .expect("Tree generation should not fail")
                .current();
            DMatrix::from(&coo)
        })
        .collect();
    assert_eq!(observed.len(), expected.len());
    assert_eq!(observed, expected, "Did not sample all possible values.");
 }
 #[cfg(feature = "slow-tests")]
 #[test]
 fn coo_with_duplicates_samples_all_admissible_outputs() {
    // Nearly identical to the test for coo_no_duplicates. The difference lies in the
    // "success" criterion: coo_with_duplicates can also produce matrices whose values
    // fall outside of the value constraints. See below for details.
    // A deterministic runner keeps the outcome of the test "stable".
    let mut runner = TestRunner::deterministic();
    // Must be large enough that, with high probability, every possible case gets sampled
    let num_generated_matrices = 500000;
    let values = -1..=1;
    let rows = 0..=2;
    let cols = 0..=3;
    let strategy = coo_with_duplicates(values.clone(), rows.clone(), cols.clone(), 2 * 3, 2);
    // The complete set of matrices that satisfy the constraints
    // (note: this is only a subset of what `coo_with_duplicates` can generate)
    let admissible = generate_all_possible_matrices(values, rows, cols);
    // The set of matrices that the strategy actually produced
    let observed: HashSet<_> = (0..num_generated_matrices)
        .map(|_| {
            let coo = strategy
                .new_tree(&mut runner)
                .expect("Tree generation should not fail")
                .current();
            DMatrix::from(&coo)
        })
        .collect();
    // Equality between the observed and the admissible matrices cannot be expected here.
    // The strategy must nevertheless be able to produce every matrix that fits
    // the constraints, i.e. the admissible set must be contained in the observed set.
    assert!(admissible.is_subset(&observed));
 }
 #[test]
 fn coo_no_duplicates_generates_admissible_matrices() {
 }