From 7260f05b07d25dcc6796577c8b9289b610ce6b53 Mon Sep 17 00:00:00 2001
From: Andreas Longva <andreas.b.longva@gmail.com>
Date: Wed, 18 Nov 2020 13:54:14 +0100
Subject: [PATCH] Improved CooMatrix proptest strategies

---
 nalgebra-sparse/Cargo.toml                   |   6 +
 nalgebra-sparse/src/coo.rs                   |   2 +-
 nalgebra-sparse/src/proptest.rs              | 193 ++++++++++++++++---
 nalgebra-sparse/tests/unit.rs                |   3 +
 nalgebra-sparse/tests/unit_tests/mod.rs      |   3 +-
 nalgebra-sparse/tests/unit_tests/proptest.rs | 134 +++++++++++++
 6 files changed, 312 insertions(+), 29 deletions(-)
diff --git a/nalgebra-sparse/Cargo.toml b/nalgebra-sparse/Cargo.toml
index bf0d3849..3a4f08c1 100644
--- a/nalgebra-sparse/Cargo.toml
+++ b/nalgebra-sparse/Cargo.toml
@@ -7,7 +7,13 @@ edition = "2018"
 [features]
 proptest-support = ["proptest", "nalgebra/proptest"]
 
+# Enable to enable running some tests that take a lot of time to run
+slow-tests = []
+
 [dependencies]
 nalgebra = { version="0.23", path = "../" }
 num-traits = { version = "0.2", default-features = false }
 proptest = { version = "0.10", optional = true }
+
+[dev-dependencies]
+itertools = "0.9"
diff --git a/nalgebra-sparse/src/coo.rs b/nalgebra-sparse/src/coo.rs
index a22f65cd..9869d502 100644
--- a/nalgebra-sparse/src/coo.rs
+++ b/nalgebra-sparse/src/coo.rs
@@ -37,7 +37,7 @@ use num_traits::Zero;
 ///
 /// // TODO: Convert to CSR
 /// ```
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CooMatrix<T> {
     nrows: usize,
     ncols: usize,
diff --git a/nalgebra-sparse/src/proptest.rs b/nalgebra-sparse/src/proptest.rs
index 59d832e7..fae49028 100644
--- a/nalgebra-sparse/src/proptest.rs
+++ b/nalgebra-sparse/src/proptest.rs
@@ -4,38 +4,177 @@
 
 use crate::coo::CooMatrix;
 use proptest::prelude::*;
-use proptest::collection::{SizeRange, vec};
+use proptest::collection::{vec, hash_map};
 use nalgebra::Scalar;
+use std::cmp::min;
+use std::iter::repeat;
+use proptest::sample::{Index};
 
-/// TODO
-pub fn coo<T>(
-                 value_strategy: T,
-                 rows: impl Strategy<Value=usize> + 'static,
-                 cols: impl Strategy<Value=usize> + 'static,
-                 max_nonzeros: usize) -> BoxedStrategy<CooMatrix<T::Value>>
+/// A strategy for generating `nnz` triplets.
+///
+/// This strategy should generally only be used when `nnz` is close to `nrows * ncols`.
+fn dense_triplet_strategy<T>(value_strategy: T,
+                             nrows: usize,
+                             ncols: usize,
+                             nnz: usize)
+                             -> impl Strategy<Value=Vec<(usize, usize, T::Value)>>
 where
     T: Strategy + Clone + 'static,
     T::Value: Scalar,
 {
-    (rows, cols, (0 ..= max_nonzeros))
-        .prop_flat_map(move |(nrows, ncols, nnz)| {
-            // If the numbers of rows and columns are small in comparison with the
-            // max nnz, it will lead to small matrices essentially always turning out to be dense.
-            // To address this, we correct the nnz by computing the modulo with the
-            // maximum number of non-zeros (ignoring duplicates) we can have for
-            // the given dimensions.
-            // This way we can still generate very sparse matrices for small matrices.
-            let max_nnz = nrows * ncols;
-            let nnz = if max_nnz == 0 { 0 } else { nnz % max_nnz };
-            let row_index_strategy = if nrows > 0 { 0 .. nrows } else { 0 .. 1 };
-            let col_index_strategy = if ncols > 0 { 0 .. ncols } else { 0 .. 1 };
-            let row_indices = vec![row_index_strategy.clone(); nnz];
-            let col_indices = vec![col_index_strategy.clone(); nnz];
-            let values_strategy = vec![value_strategy.clone(); nnz];
+    assert!(nnz <= nrows * ncols);
 
-            (Just(nrows), Just(ncols), row_indices, col_indices, values_strategy)
-        }).prop_map(|(nrows, ncols, row_indices, col_indices, values)| {
-            CooMatrix::try_from_triplets(nrows, ncols, row_indices, col_indices, values)
-                .expect("We should always generate valid COO data.")
-        }).boxed()
+    // Construct a number of booleans of which exactly `nnz` are true.
+    let booleans: Vec<_> = repeat(true)
+        .take(nnz)
+        .chain(repeat(false))
+        .take(nrows * ncols)
+        .collect();
+
+    Just(booleans)
+        // Shuffle the booleans so that they are randomly distributed
+        .prop_shuffle()
+        // Convert the booleans into a list of coordinate pairs
+        .prop_map(move |booleans| {
+            booleans
+                .into_iter()
+                .enumerate()
+                .filter_map(|(index, is_entry)| {
+                    if is_entry {
+                        // Convert linear index to row/col pair
+                        let i = index / ncols;
+                        let j = index % ncols;
+                        Some((i, j))
+                    } else {
+                        None
+                    }
+                })
+                .collect::<Vec<_>>()
+        })
+        // Assign values to each coordinate pair in order to generate a list of triplets
+        .prop_flat_map(move |coords| {
+            vec![value_strategy.clone(); coords.len()]
+                .prop_map(move |values| {
+                    coords.clone().into_iter()
+                        .zip(values)
+                        .map(|((i, j), v)| {
+                            (i, j, v)
+                        })
+                        .collect::<Vec<_>>()
+                })
+        })
+}
+
+/// A strategy for generating `nnz` triplets.
+///
+/// This strategy should generally only be used when `nnz << nrows * ncols`. If `nnz` is too
+/// close to `nrows * ncols` it may fail due to excessive rejected samples.
+fn sparse_triplet_strategy<T>(value_strategy: T,
+                             nrows: usize,
+                             ncols: usize,
+                             nnz: usize)
+                             -> impl Strategy<Value=Vec<(usize, usize, T::Value)>>
+    where
+        T: Strategy + Clone + 'static,
+        T::Value: Scalar,
+{
+    // Have to handle the zero case: proptest doesn't like empty ranges (i.e. 0 .. 0)
+    let row_index_strategy = if nrows > 0 { 0 .. nrows } else { 0 .. 1 };
+    let col_index_strategy = if ncols > 0 { 0 .. ncols } else { 0 .. 1 };
+    let coord_strategy = (row_index_strategy, col_index_strategy);
+    hash_map(coord_strategy, value_strategy.clone(), nnz)
+        .prop_map(|hash_map| {
+            let triplets: Vec<_> = hash_map
+                .into_iter()
+                .map(|((i, j), v)| (i, j, v))
+                .collect();
+            triplets
+        })
+        // Although order in the hash map is unspecified, it's not necessarily *random*
+        // - or, in particular, it does not necessarily sample the whole space of possible outcomes -
+        // so we additionally shuffle the triplets
+        .prop_shuffle()
+}
+
+/// TODO
+pub fn coo_no_duplicates<T>(
+    value_strategy: T,
+    rows: impl Strategy<Value=usize> + 'static,
+    cols: impl Strategy<Value=usize> + 'static,
+    max_nonzeros: usize) -> impl Strategy<Value=CooMatrix<T::Value>>
+where
+    T: Strategy + Clone + 'static,
+    T::Value: Scalar,
+{
+    (rows, cols)
+        .prop_flat_map(move |(nrows, ncols)| {
+            let max_nonzeros = min(max_nonzeros, nrows * ncols);
+            let size_range = 0 ..= max_nonzeros;
+            let value_strategy = value_strategy.clone();
+
+            size_range.prop_flat_map(move |nnz| {
+                let value_strategy = value_strategy.clone();
+                if nnz as f64 > 0.10 * (nrows as f64) * (ncols as f64) {
+                    // If the number of nnz is sufficiently dense, then use the dense
+                    // sample strategy
+                    dense_triplet_strategy(value_strategy, nrows, ncols, nnz).boxed()
+                } else {
+                    // Otherwise, use a hash map strategy so that we can get a sparse sampling
+                    // (so that complexity is rather on the order of max_nnz than nrows * ncols)
+                    sparse_triplet_strategy(value_strategy, nrows, ncols, nnz).boxed()
+                }
+            })
+            .prop_map(move |triplets| {
+                let mut coo = CooMatrix::new(nrows, ncols);
+                for (i, j, v) in triplets {
+                    coo.push(i, j, v);
+                }
+                coo
+            })
+        })
+}
+
+/// TODO
+///
+/// TODO: Write note on how this strategy only maintains the constraints on values
+/// for each triplet, but does not consider the sum of triplets
+pub fn coo_with_duplicates<T>(
+                 value_strategy: T,
+                 rows: impl Strategy<Value=usize> + 'static,
+                 cols: impl Strategy<Value=usize> + 'static,
+                 max_nonzeros: usize,
+                 max_duplicates: usize)
+    -> impl Strategy<Value=CooMatrix<T::Value>>
+where
+    T: Strategy + Clone + 'static,
+    T::Value: Scalar,
+{
+    let coo_strategy = coo_no_duplicates(value_strategy.clone(), rows, cols, max_nonzeros);
+    let duplicate_strategy = vec((any::<Index>(), value_strategy.clone()), 0 ..= max_duplicates);
+    (coo_strategy, duplicate_strategy)
+        .prop_flat_map(|(coo, duplicates)| {
+            let mut triplets: Vec<(usize, usize, T::Value)> = coo.triplet_iter()
+                .map(|(i, j, v)| (i, j, v.clone()))
+                .collect();
+            if !triplets.is_empty() {
+                let duplicates_iter: Vec<_> = duplicates
+                    .into_iter()
+                    .map(|(idx, val)| {
+                        let (i, j, _) = idx.get(&triplets);
+                        (*i, *j, val)
+                    })
+                    .collect();
+                triplets.extend(duplicates_iter);
+            }
+            // Make sure to shuffle so that the duplicates get mixed in with the non-duplicates
+            let shuffled = Just(triplets).prop_shuffle();
+            (Just(coo.nrows()), Just(coo.ncols()), shuffled)
+        })
+        .prop_map(move |(nrows, ncols, triplets)| {
+            let mut coo = CooMatrix::new(nrows, ncols);
+            for (i, j, v) in triplets {
+                coo.push(i, j, v);
+            }
+            coo
+        })
 }
\ No newline at end of file
diff --git a/nalgebra-sparse/tests/unit.rs b/nalgebra-sparse/tests/unit.rs
index ee44e73b..b7badffd 100644
--- a/nalgebra-sparse/tests/unit.rs
+++ b/nalgebra-sparse/tests/unit.rs
@@ -1,4 +1,7 @@
 //! Unit tests
+#[cfg(not(feature = "proptest-support"))]
+compile_error!("Tests must be run with feature proptest-support");
+
 mod unit_tests;
 
 #[macro_use]
diff --git a/nalgebra-sparse/tests/unit_tests/mod.rs b/nalgebra-sparse/tests/unit_tests/mod.rs
index 733357c9..f5b6d935 100644
--- a/nalgebra-sparse/tests/unit_tests/mod.rs
+++ b/nalgebra-sparse/tests/unit_tests/mod.rs
@@ -2,4 +2,5 @@ mod coo;
 mod ops;
 mod pattern;
 mod csr;
-mod csc;
\ No newline at end of file
+mod csc;
+mod proptest;
\ No newline at end of file
diff --git a/nalgebra-sparse/tests/unit_tests/proptest.rs b/nalgebra-sparse/tests/unit_tests/proptest.rs
index e69de29b..23afa741 100644
--- a/nalgebra-sparse/tests/unit_tests/proptest.rs
+++ b/nalgebra-sparse/tests/unit_tests/proptest.rs
@@ -0,0 +1,134 @@
+use nalgebra_sparse::proptest::{coo_with_duplicates, coo_no_duplicates};
+use nalgebra::DMatrix;
+
+use proptest::prelude::*;
+use itertools::Itertools;
+
+use std::collections::HashSet;
+use std::iter::repeat;
+
+#[cfg(feature = "slow-tests")]
+use {
+    proptest::test_runner::TestRunner,
+    proptest::strategy::ValueTree
+};
+use std::ops::RangeInclusive;
+
+#[cfg(feature = "slow-tests")]
+fn generate_all_possible_matrices(value_range: RangeInclusive<i32>,
+                                  rows_range: RangeInclusive<usize>,
+                                  cols_range: RangeInclusive<usize>)
+    -> HashSet<DMatrix<i32>>
+{
+    // Enumerate all possible combinations
+    let mut all_combinations = HashSet::new();
+    for nrows in rows_range {
+        for ncols in cols_range.clone() {
+            // For the given number of rows and columns
+            let n_values = nrows * ncols;
+
+            if n_values == 0 {
+                // If we have zero rows or columns, the set of matrices with the given
+                // rows and columns is a single element: an empty matrix
+                all_combinations.insert(DMatrix::from_row_slice(nrows, ncols, &[]));
+            } else {
+                // Otherwise, we need to sample all possible matrices.
+                // To do this, we generate the values as the (multi) Cartesian product
+                // of the value sets. For example, for a 2x2 matrices, we consider
+                // all possible 4-element arrays that the matrices can take by
+                // considering all elements in the cartesian product
+                //  V x V x V x V
+                // where V is the set of eligible values, e.g. V := -1 ..= 1
+                let values_iter = repeat(value_range.clone())
+                    .take(n_values)
+                    .multi_cartesian_product();
+                for matrix_values in values_iter {
+                    all_combinations.insert(DMatrix::from_row_slice(nrows, ncols, &matrix_values));
+                }
+            }
+        }
+    }
+    all_combinations
+}
+
+#[cfg(feature = "slow-tests")]
+#[test]
+fn coo_no_duplicates_samples_all_admissible_outputs() {
+    // Note: This test basically mirrors a similar test for `matrix` in the `nalgebra` repo.
+
+    // Test that the proptest generation covers all possible outputs for a small space of inputs
+    // given enough samples.
+
+    // We use a deterministic test runner to make the test "stable".
+    let mut runner = TestRunner::deterministic();
+
+    // This number needs to be high enough so that we with high probability sample
+    // all possible cases
+    let num_generated_matrices = 500000;
+
+    let values = -1..=1;
+    let rows = 0..=2;
+    let cols = 0..=3;
+    let strategy = coo_no_duplicates(values.clone(), rows.clone(), cols.clone(), 2 * 3);
+
+    // Enumerate all possible combinations
+    let all_combinations = generate_all_possible_matrices(values, rows, cols);
+
+    let mut visited_combinations = HashSet::new();
+    for _ in 0..num_generated_matrices {
+        let tree = strategy
+            .new_tree(&mut runner)
+            .expect("Tree generation should not fail");
+        let matrix = tree.current();
+        visited_combinations.insert(DMatrix::from(&matrix));
+    }
+
+    assert_eq!(visited_combinations.len(), all_combinations.len());
+    assert_eq!(visited_combinations, all_combinations, "Did not sample all possible values.");
+}
+
+#[cfg(feature = "slow-tests")]
+#[test]
+fn coo_with_duplicates_samples_all_admissible_outputs() {
+    // This is almost the same as the test for coo_no_duplicates, except that we need
+    // a different "success" criterion, since coo_with_duplicates is able to generate
+    // matrices with values outside of the value constraints. See below for details.
+
+    // We use a deterministic test runner to make the test "stable".
+    let mut runner = TestRunner::deterministic();
+
+    // This number needs to be high enough so that we with high probability sample
+    // all possible cases
+    let num_generated_matrices = 500000;
+
+    let values = -1..=1;
+    let rows = 0..=2;
+    let cols = 0..=3;
+    let strategy = coo_with_duplicates(values.clone(), rows.clone(), cols.clone(), 2 * 3, 2);
+
+    // Enumerate all possible combinations that fit the constraints
+    // (note: this is only a subset of the matrices that can be generated by
+    // `coo_with_duplicates`)
+    let all_combinations = generate_all_possible_matrices(values, rows, cols);
+
+    let mut visited_combinations = HashSet::new();
+    for _ in 0..num_generated_matrices {
+        let tree = strategy
+            .new_tree(&mut runner)
+            .expect("Tree generation should not fail");
+        let matrix = tree.current();
+        visited_combinations.insert(DMatrix::from(&matrix));
+    }
+
+    // Here we cannot verify that the set of visited combinations is *equal* to
+    // all possible outcomes with the given constraints, however the
+    // strategy should be able to generate all matrices that fit the constraints.
+    // In other words, we need to determine that set of all admissible matrices
+    // is contained in the set of visited matrices
+    assert!(all_combinations.is_subset(&visited_combinations));
+}
+
+#[test]
+fn coo_no_duplicates_generates_admissible_matrices() {
+
+}
\ No newline at end of file