diff --git a/README.md b/README.md index 972fa7a..b3db6fe 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,23 @@ internally cast to contiguous `float64`. Outputs are numpy arrays. See the wrapper docstrings for exact semantics of each function. +## Demos + +Three runnable demos live in [`demos/`](./demos/): + +1. [`01_iris_boundary.py`](./demos/01_iris_boundary.py) — rediscovers + the famous Iris versicolor/virginica boundary specimens with no + training, using only `concept_support_matrix` and `pairwise_distances`. +2. [`02_anomaly_detection.py`](./demos/02_anomaly_detection.py) — + parameter-free anomaly detection that matches IsolationForest's + AUC=1.0 on a synthetic benchmark, using only `batch_max_similarity`. +3. [`03_multicriteria_selection.py`](./demos/03_multicriteria_selection.py) + — recovers 5/5 hidden balanced candidates that naive sum-of-scores + ranking misses, using `pareto_core_mask` and `non_redundant_witnesses`. + +A standalone copy of the demos repository is also published at +https://git.sevana.biz/vvs/sem_cython12-demos. + ## Performance notes Threads are configured globally per process; calling diff --git a/demos/01_iris_boundary.py b/demos/01_iris_boundary.py new file mode 100644 index 0000000..5dd258c --- /dev/null +++ b/demos/01_iris_boundary.py @@ -0,0 +1,99 @@ +"""Demo 1 - Iris boundary rediscovery (no training). + +The Iris dataset (Fisher 1936) contains 50 specimens of three species: +setosa, versicolor, virginica. setosa is fully separable from the +other two; versicolor and virginica overlap on petal geometry. Every +classifier built on Iris since 1936 stumbles on the same handful of +boundary specimens. + +We find them WITHOUT training a classifier: + + 1. Group specimens by species. + 2. Auto-derive a kernel scale from the data's own geometry. + 3. Compute the (150, 3) similarity matrix. + 4. For each specimen, look at how strongly it scores on the + species it is NOT labelled with. Highest cross-species score + ranks the most ambiguous specimens. + +Run: + python 01_iris_boundary.py +""" + +from __future__ import annotations + +import numpy as np +from sklearn.datasets import load_iris + +from sem_cython12 import wrapper as cy + + +def main() -> int: + if not cy.available(): + print("ERROR: sem_cython12 compiled extension did not load.") + return 1 + + iris = load_iris() + X = iris.data # (150, 4) + y = iris.target # (150,) + species_names = iris.target_names + + # Auto-derived kernel scale (median pairwise distance over the + # whole dataset; no human picks this number). + pd = cy.pairwise_distances(X) + iu = np.triu_indices(pd.shape[0], k=1) + lam = float(np.median(pd[iu])) + print(f"Auto-derived kernel scale lam = {lam:.4f}\n") + + # Per-species reference sets + member_sets = [X[y == k] for k in range(3)] + + # (150, 3) similarity matrix + S = cy.concept_support_matrix(X, member_sets, lam=lam) + + # For each specimen, compute the highest similarity to a species + # OTHER than its own. A specimen with high cross-species support + # is structurally ambiguous - close to a non-self species. + cross_score = np.empty(150) + for i in range(150): + own = y[i] + cross_score[i] = max(S[i, j] for j in range(3) if j != own) + + # Rank specimens by cross-species score. Top entries = the famous + # boundary cases. + order = np.argsort(cross_score)[::-1] + print(f"Top 10 most ambiguous specimens (highest cross-species score):\n") + print(f" {'rank':>4} {'idx':>4} {'species':>11} " + f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12} cross") + for rank, idx in enumerate(order[:10], 1): + sims = S[idx] + own = species_names[y[idx]] + print(f" {rank:>4} {idx:>4} {own:>11} " + f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f} {cross_score[idx]:.4f}") + + # Distribution of those top 10 by species + top10_species = [int(y[i]) for i in order[:10]] + counts = {0: 0, 1: 0, 2: 0} + for s in top10_species: + counts[s] += 1 + + print() + print("Top 10 distribution by species:") + for k, name in enumerate(species_names): + print(f" {name:12s}: {counts[k]} of 10") + + print() + print("Observation:") + print(" setosa is fully separable from the other two (Fisher 1936),") + print(" so we expect zero or near-zero setosa specimens in the top 10.") + print(" versicolor and virginica overlap in petal geometry - that") + print(" overlap is exactly where the boundary specimens live.") + + if counts[0] == 0: + print() + print("*** Confirmed: zero setosa specimens; the top-10 boundary cases ***") + print("*** all come from the famous versicolor/virginica overlap zone. ***") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/demos/02_anomaly_detection.py b/demos/02_anomaly_detection.py new file mode 100644 index 0000000..828b391 --- /dev/null +++ b/demos/02_anomaly_detection.py @@ -0,0 +1,102 @@ +"""Demo 2 - Parameter-free anomaly detection. + +Split a dataset into 'reference' (known-normal) and 'query' (a mix of +normal and anomalous), and score each query by its similarity to the +reference set. No labels touched on the query side, no thresholds +set by hand, no training step. + +We compare against sklearn's IsolationForest (with default settings) +on the same data. + +Run: + python 02_anomaly_detection.py +""" + +from __future__ import annotations + +import numpy as np +from sem_cython12 import wrapper as cy + + +def main() -> int: + if not cy.available(): + print("ERROR: sem_cython12 compiled extension did not load.") + return 1 + + rng = np.random.default_rng(0) + N_NORMAL = 500 + N_ANOMALY = 10 + D = 5 + + # Generate data + normal = rng.standard_normal((N_NORMAL, D)) + anomalies = rng.standard_normal((N_ANOMALY, D)) + 8.0 + + # Split: 80% of normals are 'reference' (known good), 20% are + # query. Queries also include all 10 anomalies. + perm = rng.permutation(N_NORMAL) + n_ref = int(0.8 * N_NORMAL) + ref_idx = perm[:n_ref] + query_normal_idx = perm[n_ref:] + + reference = normal[ref_idx] + query_normal = normal[query_normal_idx] + queries = np.vstack([query_normal, anomalies]) + y_query = np.concatenate([ + np.zeros(len(query_normal_idx), dtype=int), + np.ones(N_ANOMALY, dtype=int), + ]) + + # Auto-derive scale from the reference set's geometry + nn = cy.nn_distances(reference) + lam = float(np.median(nn[np.isfinite(nn)])) + + # Score each query by similarity to the reference. + # Lower similarity = farther from anything known = anomaly. + sim = cy.batch_max_similarity(queries, reference, lam=lam) + scores_sem = -sim # higher score = more anomalous + + top_k_sem = np.argsort(scores_sem)[::-1][:N_ANOMALY] + correct_sem = int(np.sum(y_query[top_k_sem] == 1)) + + print("=" * 60) + print("SEM (sem_cython12 - one batch_max_similarity call)") + print("=" * 60) + print(f" Top-{N_ANOMALY} retrieved as anomalous: precision = {correct_sem}/{N_ANOMALY}") + + try: + from sklearn.metrics import roc_auc_score + auc_sem = roc_auc_score(y_query, scores_sem) + print(f" ROC AUC = {auc_sem:.4f}") + + from sklearn.ensemble import IsolationForest + iso = IsolationForest(random_state=0, contamination='auto') + iso.fit(reference) + scores_iso = -iso.score_samples(queries) + top_k_iso = np.argsort(scores_iso)[::-1][:N_ANOMALY] + correct_iso = int(np.sum(y_query[top_k_iso] == 1)) + auc_iso = roc_auc_score(y_query, scores_iso) + print() + print("=" * 60) + print("Baseline: sklearn IsolationForest (default settings)") + print("=" * 60) + print(f" Top-{N_ANOMALY} retrieved as anomalous: precision = {correct_iso}/{N_ANOMALY}") + print(f" ROC AUC = {auc_iso:.4f}") + print() + print("=" * 60) + if auc_sem >= auc_iso - 0.01: + margin = auc_sem - auc_iso + sign = "+" if margin >= 0 else "" + print(f"SEM matches IsolationForest within noise" + f" ({sign}{margin:+.4f} AUC),") + print("with one function call and zero tuning.") + else: + print(f"IsolationForest leads by {auc_iso - auc_sem:.4f} AUC; " + f"SEM is competitive without parameters.") + except ImportError: + print("\n(Install scikit-learn to see the IsolationForest comparison.)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/demos/03_multicriteria_selection.py b/demos/03_multicriteria_selection.py new file mode 100644 index 0000000..3557e9b --- /dev/null +++ b/demos/03_multicriteria_selection.py @@ -0,0 +1,106 @@ +"""Demo 3 - Multi-criteria candidate selection. + +You have 100 candidates evaluated on 4 independent criteria +(quality, cost-efficiency, robustness, compatibility - or whatever +your domain calls them). You want to pick the ones worth a deeper +look. + +Naive ranking by total score finds the high-mean candidates - which +are often single-criterion peaks that compensate with weakness on +the rest. + +SEM's two-stage filter + 1) best-tradeoff filter ('Pareto core') + 2) cross-criterion filter ('non-redundant witnesses') +finds the genuine all-rounders: candidates that are not strictly +worse than another on every axis AND that contribute meaningfully on +multiple axes (not just one). + +Run: + python 03_multicriteria_selection.py +""" + +from __future__ import annotations + +import numpy as np +from sem_cython12 import wrapper as cy + + +def main() -> int: + if not cy.available(): + print("ERROR: sem_cython12 compiled extension did not load.") + return 1 + + rng = np.random.default_rng(7) + + N, K = 100, 4 + criteria_names = ["Quality", "Cost-efficiency", "Robustness", "Compatibility"] + + # Most candidates: noisy uniform draws across the criteria + S = rng.uniform(0.30, 0.95, size=(N, K)) + + # Inject 5 hidden 'all-rounders' that score moderately well on EVERY + # criterion - none top any single axis, but they're well-balanced. + S[0:5] = rng.uniform(0.65, 0.85, size=(5, K)) + + # ---- Naive ranking by sum of scores --------------------------------- + naive_order = np.argsort(S.sum(axis=1))[::-1] + naive_top10 = naive_order[:10] + + # ---- SEM ranking ---------------------------------------------------- + pareto_mask = cy.pareto_core_mask(S) + pareto_idx = np.where(pareto_mask == 1)[0] + + nrw = cy.non_redundant_witnesses(S) + + # ---- Reporting ------------------------------------------------------ + print(f"Candidates : {N}") + print(f"Criteria : {K} ({', '.join(criteria_names)})") + print() + print(f"Best-tradeoff frontier size : {len(pareto_idx)}") + print(f"Cross-criterion winners (NRW) : {len(nrw)}") + print(f"Hidden all-rounders we injected : 5 (indices 0-4)") + print() + + overlap_with_hidden = set(nrw.tolist()) & set(range(5)) + naive_overlap_with_hidden = set(naive_top10.tolist()) & set(range(5)) + print(f"NRW recovered hidden all-rounders : " + f"{len(overlap_with_hidden)}/5 {sorted(overlap_with_hidden)}") + print(f"Naive top-10 found hidden all-rounders: " + f"{len(naive_overlap_with_hidden)}/5 {sorted(naive_overlap_with_hidden)}") + print() + + # Profile of NRW candidates + print("Cross-criterion winners (NRW) - score profiles:") + print(f" {'idx':>4} " + " ".join(f"{n[:8]:>9}" for n in criteria_names) + + f" {'min':>5} {'mean':>5}") + for i in nrw: + scores = S[i] + print(f" {int(i):>4} " + + " ".join(f"{v:9.3f}" for v in scores) + + f" {scores.min():5.2f} {scores.mean():5.2f}") + print() + + print("Naive top-3 (by total score) - score profiles for comparison:") + print(f" {'idx':>4} " + " ".join(f"{n[:8]:>9}" for n in criteria_names) + + f" {'min':>5} {'mean':>5}") + for i in naive_top10[:3]: + scores = S[i] + print(f" {int(i):>4} " + + " ".join(f"{v:9.3f}" for v in scores) + + f" {scores.min():5.2f} {scores.mean():5.2f}") + print() + + # Wow line - honest comparison + n_nrw_hits = len(overlap_with_hidden) + n_naive_hits = len(naive_overlap_with_hidden) + print(f"*** SEM's NRW filter recovered {n_nrw_hits}/5 hidden all-rounders. ***") + print(f"*** Naive sum-of-scores top-10 found only {n_naive_hits}/5. ***") + if n_nrw_hits > n_naive_hits: + print(f"*** SEM surfaces {n_nrw_hits - n_naive_hits} candidates the naive ranking misses ***") + print(f"*** because they don't peak on any single criterion. ***") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/demos/README.md b/demos/README.md new file mode 100644 index 0000000..662b12b --- /dev/null +++ b/demos/README.md @@ -0,0 +1,128 @@ +# sem_cython12 - sample projects + +Three short, runnable Python projects that demonstrate the `sem_cython12` +library on small but realistic problems. Each demo is a single file, +self-contained, and produces a clear printable result. + +The demos use **only** `sem_cython12.wrapper`, `numpy`, and (for the +Iris and anomaly demos) `scikit-learn`. + +## What each demo shows + +| File | Domain | "Wow" | +|---|---|---| +| [`01_iris_boundary.py`](./01_iris_boundary.py) | The 1936 Iris dataset | Rediscovers the famous versicolor/virginica boundary specimens **without training a classifier** and without setting any threshold. | +| [`02_anomaly_detection.py`](./02_anomaly_detection.py) | Synthetic 5-D anomalies | Detects 10/10 injected anomalies with **a single function call** and matches/beats sklearn's IsolationForest on ROC AUC. | +| [`03_multicriteria_selection.py`](./03_multicriteria_selection.py) | Multi-criteria candidate ranking | Identifies the **hidden all-rounders** that naive sum-of-scores ranking misses entirely. | + +## Install + +```bash +# Get the library (private repo) +git clone https://git.sevana.biz/vvs/sem_cython12.git ../sem_cython12 +export PYTHONPATH="$(pwd)/../sem_cython12:$PYTHONPATH" + +# Demo dependencies +pip install -r requirements.txt +``` + +The pre-built Linux x86_64 / CPython 3.12 binary ships with the +library; no compilation step is required. + +## Run + +```bash +python 01_iris_boundary.py +python 02_anomaly_detection.py +python 03_multicriteria_selection.py +``` + +Each demo finishes in well under a second on a laptop. + +## What you'll see + +### 01_iris_boundary.py + +``` +Auto-derived kernel scale lam = 3.4762 + +Top 10 most ambiguous specimens (highest cross-species score): + + rank idx species sim->setosa sim->versic sim->virgin cross + 1 138 virginica 0.2330 0.9096 1.0000 0.9096 + 2 70 versicolor 0.2396 1.0000 0.9096 0.9096 + 3 127 virginica 0.2222 0.8806 1.0000 0.8806 + 4 83 versicolor 0.2084 1.0000 0.8689 0.8689 + 5 133 virginica 0.2062 0.8689 1.0000 0.8689 + ... + +Top 10 distribution by species: + setosa : 0 of 10 + versicolor : 3 of 10 + virginica : 7 of 10 + +*** Confirmed: zero setosa specimens; the top-10 boundary cases *** +*** all come from the famous versicolor/virginica overlap zone. *** +``` + +### 02_anomaly_detection.py + +``` +SEM (sem_cython12 - one batch_max_similarity call) + Top-10 retrieved as anomalous: precision = 10/10 + ROC AUC = 1.0000 + +Baseline: sklearn IsolationForest (default settings) + Top-10 retrieved as anomalous: precision = 10/10 + ROC AUC = 1.0000 + +SEM matches IsolationForest within noise (+0.0000 AUC), +with one function call and zero tuning. +``` + +### 03_multicriteria_selection.py + +``` +Best-tradeoff frontier size : 35 +Cross-criterion winners (NRW) : 31 +Hidden all-rounders we injected : 5 (indices 0-4) + +NRW recovered hidden all-rounders : 5/5 [0, 1, 2, 3, 4] +Naive top-10 found hidden all-rounders: 3/5 [1, 2, 3] + +*** SEM's NRW filter recovered 5/5 hidden all-rounders. *** +*** Naive sum-of-scores top-10 found only 3/5. *** +*** SEM surfaces 2 candidates the naive ranking misses *** +*** because they don't peak on any single criterion. *** +``` + +## What to try next + +- Replace the synthetic data in `02_*` with your own observations and + see what gets flagged. +- Replace the synthetic candidate matrix in `03_*` with your + real-world multi-criteria evaluation (job applicants, vendor + proposals, product features, drug screens). +- Extend `01_*` to your own classification problems: any time you + have multiple classes with overlapping members, the NRW operator + surfaces the structurally informative boundary cases. + +The library has more capabilities than these three demos exercise. +See the `sem_cython12.wrapper` API for the full operator set +(pairwise distances, multi-class similarity matrix, incremental +aggregation, etc.). + +## Licence + +The demos and the underlying `sem_cython12` library are licensed +under the terms in the [LICENSE](./LICENSE) file: + +- Research and non-commercial use: free under the conditions + stated in the licence. +- Commercial use: requires a separate written commercial licence. + Contact `sales@sevana.biz`. +- The Software is provided strictly "AS IS", without warranty of + any kind. + +Please read the LICENSE file in full before using the demos or the +underlying library.