Vendor demos under demos/ and link from README for landing-page visibility
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
"""Demo 2 - Parameter-free anomaly detection.
|
||||
|
||||
Split a dataset into 'reference' (known-normal) and 'query' (a mix of
|
||||
normal and anomalous), and score each query by its similarity to the
|
||||
reference set. No labels touched on the query side, no thresholds
|
||||
set by hand, no training step.
|
||||
|
||||
We compare against sklearn's IsolationForest (with default settings)
|
||||
on the same data.
|
||||
|
||||
Run:
|
||||
python 02_anomaly_detection.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
from sem_cython12 import wrapper as cy
|
||||
|
||||
|
||||
def main() -> int:
    """Run the anomaly-detection demo and print a comparison report.

    Builds a synthetic dataset (500 standard-normal points plus 10 points
    shifted by +8 in every coordinate, in 5 dimensions), keeps 80% of the
    normal points as the reference set, and ranks the remaining normals
    together with all anomalies by negated similarity to that reference
    set. When scikit-learn is importable, the ROC AUC and an
    IsolationForest baseline fitted on the same reference set are reported
    as well.

    Returns:
        0 when the demo ran (with or without the scikit-learn comparison),
        1 when the sem_cython12 compiled extension is not available.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    # Fixed seed so every run prints the same numbers.
    rng = np.random.default_rng(0)
    N_NORMAL = 500
    N_ANOMALY = 10
    D = 5

    # Generate data
    normal = rng.standard_normal((N_NORMAL, D))
    anomalies = rng.standard_normal((N_ANOMALY, D)) + 8.0

    # Split: 80% of normals are 'reference' (known good), 20% are
    # query. Queries also include all 10 anomalies.
    perm = rng.permutation(N_NORMAL)
    n_ref = int(0.8 * N_NORMAL)
    ref_idx = perm[:n_ref]
    query_normal_idx = perm[n_ref:]

    reference = normal[ref_idx]
    query_normal = normal[query_normal_idx]
    queries = np.vstack([query_normal, anomalies])
    # Ground truth for evaluation only: 0 = normal query, 1 = anomaly.
    y_query = np.concatenate([
        np.zeros(len(query_normal_idx), dtype=int),
        np.ones(N_ANOMALY, dtype=int),
    ])

    # Auto-derive scale from the reference set's geometry: the median of
    # the finite values returned by nn_distances.
    # NOTE(review): presumably one nearest-neighbour distance per
    # reference point -- confirm against the sem_cython12 wrapper.
    nn = cy.nn_distances(reference)
    lam = float(np.median(nn[np.isfinite(nn)]))

    # Score each query by similarity to the reference.
    # Lower similarity = farther from anything known = anomaly.
    sim = cy.batch_max_similarity(queries, reference, lam=lam)
    scores_sem = -sim  # higher score = more anomalous

    # Take the N_ANOMALY highest-scoring queries and count true anomalies.
    top_k_sem = np.argsort(scores_sem)[::-1][:N_ANOMALY]
    correct_sem = int(np.sum(y_query[top_k_sem] == 1))

    print("=" * 60)
    print("SEM (sem_cython12 - one batch_max_similarity call)")
    print("=" * 60)
    print(f"  Top-{N_ANOMALY} retrieved as anomalous: precision = {correct_sem}/{N_ANOMALY}")

    # scikit-learn is optional. Only the imports are guarded, so an
    # ImportError raised later during scoring is not mistaken for a
    # missing dependency.
    try:
        from sklearn.ensemble import IsolationForest
        from sklearn.metrics import roc_auc_score
    except ImportError:
        print("\n(Install scikit-learn to see the IsolationForest comparison.)")
        return 0

    auc_sem = roc_auc_score(y_query, scores_sem)
    print(f"  ROC AUC = {auc_sem:.4f}")

    iso = IsolationForest(random_state=0, contamination='auto')
    iso.fit(reference)
    # score_samples is negated so that, like scores_sem, a higher value
    # means more anomalous.
    scores_iso = -iso.score_samples(queries)
    top_k_iso = np.argsort(scores_iso)[::-1][:N_ANOMALY]
    correct_iso = int(np.sum(y_query[top_k_iso] == 1))
    auc_iso = roc_auc_score(y_query, scores_iso)

    print()
    print("=" * 60)
    print("Baseline: sklearn IsolationForest (default settings)")
    print("=" * 60)
    print(f"  Top-{N_ANOMALY} retrieved as anomalous: precision = {correct_iso}/{N_ANOMALY}")
    print(f"  ROC AUC = {auc_iso:.4f}")
    print()
    print("=" * 60)

    # A shortfall of up to 0.01 AUC is treated as a tie.
    if auc_sem >= auc_iso - 0.01:
        margin = auc_sem - auc_iso
        # The '+' format flag already emits the sign; adding one by hand
        # printed "++0.0012" for non-negative margins.
        print(f"SEM matches IsolationForest within noise"
              f" ({margin:+.4f} AUC),")
        print("with one function call and zero tuning.")
    else:
        print(f"IsolationForest leads by {auc_iso - auc_sem:.4f} AUC; "
              f"SEM is competitive without parameters.")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the demo and pass main()'s return value to
    # SystemExit so it becomes the process exit status.
    status = main()
    raise SystemExit(status)
|
||||
Reference in New Issue
Block a user