Vendor demos under demos/ and link from README for landing-page visibility

2026-05-09 15:25:52 +01:00
parent 6597509f79
commit c886ded981
5 changed files with 452 additions and 0 deletions
@@ -0,0 +1,102 @@
+"""Demo 2 - Parameter-free anomaly detection.
+
+Split a dataset into 'reference' (known-normal) and 'query' (a mix of
+normal and anomalous), and score each query by its similarity to the
+reference set.  No labels touched on the query side, no thresholds
+set by hand, no training step.
+
+We compare against sklearn's IsolationForest (with default settings)
+on the same data.
+
+Run:
+    python 02_anomaly_detection.py
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from sem_cython12 import wrapper as cy
+
+
+def main() -> int:
+    if not cy.available():
+        print("ERROR: sem_cython12 compiled extension did not load.")
+        return 1
+
+    rng = np.random.default_rng(0)
+    N_NORMAL = 500
+    N_ANOMALY = 10
+    D = 5
+
+    # Generate data
+    normal = rng.standard_normal((N_NORMAL, D))
+    anomalies = rng.standard_normal((N_ANOMALY, D)) + 8.0
+
+    # Split: 80% of normals are 'reference' (known good), 20% are
+    # query.  Queries also include all 10 anomalies.
+    perm = rng.permutation(N_NORMAL)
+    n_ref = int(0.8 * N_NORMAL)
+    ref_idx = perm[:n_ref]
+    query_normal_idx = perm[n_ref:]
+
+    reference = normal[ref_idx]
+    query_normal = normal[query_normal_idx]
+    queries = np.vstack([query_normal, anomalies])
+    y_query = np.concatenate([
+        np.zeros(len(query_normal_idx), dtype=int),
+        np.ones(N_ANOMALY, dtype=int),
+    ])
+
+    # Auto-derive scale from the reference set's geometry
+    nn = cy.nn_distances(reference)
+    lam = float(np.median(nn[np.isfinite(nn)]))
+
+    # Score each query by similarity to the reference.
+    # Lower similarity = farther from anything known = anomaly.
+    sim = cy.batch_max_similarity(queries, reference, lam=lam)
+    scores_sem = -sim                     # higher score = more anomalous
+
+    top_k_sem = np.argsort(scores_sem)[::-1][:N_ANOMALY]
+    correct_sem = int(np.sum(y_query[top_k_sem] == 1))
+
+    print("=" * 60)
+    print("SEM  (sem_cython12 - one batch_max_similarity call)")
+    print("=" * 60)
+    print(f"  Top-{N_ANOMALY} retrieved as anomalous:  precision = {correct_sem}/{N_ANOMALY}")
+
+    try:
+        from sklearn.metrics import roc_auc_score
+        auc_sem = roc_auc_score(y_query, scores_sem)
+        print(f"  ROC AUC                          = {auc_sem:.4f}")
+
+        from sklearn.ensemble import IsolationForest
+        iso = IsolationForest(random_state=0, contamination='auto')
+        iso.fit(reference)
+        scores_iso = -iso.score_samples(queries)
+        top_k_iso = np.argsort(scores_iso)[::-1][:N_ANOMALY]
+        correct_iso = int(np.sum(y_query[top_k_iso] == 1))
+        auc_iso = roc_auc_score(y_query, scores_iso)
+        print()
+        print("=" * 60)
+        print("Baseline: sklearn IsolationForest (default settings)")
+        print("=" * 60)
+        print(f"  Top-{N_ANOMALY} retrieved as anomalous:  precision = {correct_iso}/{N_ANOMALY}")
+        print(f"  ROC AUC                          = {auc_iso:.4f}")
+        print()
+        print("=" * 60)
+        if auc_sem >= auc_iso - 0.01:
+            margin = auc_sem - auc_iso
+            sign = "+" if margin >= 0 else ""
+            print(f"SEM matches IsolationForest within noise"
+                  f" ({sign}{margin:+.4f} AUC),")
+            print("with one function call and zero tuning.")
+        else:
+            print(f"IsolationForest leads by {auc_iso - auc_sem:.4f} AUC; "
+                  f"SEM is competitive without parameters.")
+    except ImportError:
+        print("\n(Install scikit-learn to see the IsolationForest comparison.)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())