Initial commit: three sem_cython12 demos (Iris boundary, anomaly detection, multi-criteria selection)
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
"""Demo 1 - Iris boundary rediscovery (no training).
|
||||
|
||||
The Iris dataset (Fisher 1936) contains 50 specimens of each of three species:
|
||||
setosa, versicolor, virginica. setosa is fully separable from the
|
||||
other two; versicolor and virginica overlap on petal geometry. Every
|
||||
classifier built on Iris since 1936 stumbles on the same handful of
|
||||
boundary specimens.
|
||||
|
||||
We find them WITHOUT training a classifier:
|
||||
|
||||
1. Group specimens by species.
|
||||
2. Auto-derive a kernel scale from the data's own geometry.
|
||||
3. Compute the (150, 3) similarity matrix.
|
||||
4. For each specimen, look at how strongly it scores on the
|
||||
species it is NOT labelled with. Highest cross-species score
|
||||
ranks the most ambiguous specimens.
|
||||
|
||||
Run:
|
||||
python 01_iris_boundary.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
from sem_cython12 import wrapper as cy
|
||||
|
||||
|
||||
def main() -> int:
    """Rank Iris specimens by their similarity to a species other than their own.

    Loads the Iris data, derives a kernel scale from the median pairwise
    distance, builds one reference set per species, asks ``sem_cython12`` for
    the specimen-by-species similarity matrix, and prints the ten specimens
    with the highest similarity to a species they are NOT labelled with,
    followed by a per-species breakdown of those ten.

    Returns:
        1 if the compiled ``sem_cython12`` extension is not available,
        otherwise 0.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    top_k = 10  # how many of the most ambiguous specimens to report

    iris = load_iris()
    X = iris.data
    y = iris.target
    species_names = iris.target_names
    # Sizes come from the data itself instead of being repeated as 150 / 3.
    n_samples = X.shape[0]
    n_species = len(species_names)

    # Auto-derived kernel scale: median over the strict upper triangle of
    # the pairwise-distance matrix, so each unordered pair is counted once
    # and the diagonal is excluded. No human picks this number.
    dist = cy.pairwise_distances(X)
    upper = np.triu_indices(dist.shape[0], k=1)
    lam = float(np.median(dist[upper]))
    print(f"Auto-derived kernel scale lam = {lam:.4f}\n")

    # Per-species reference sets.
    member_sets = [X[y == k] for k in range(n_species)]

    # Specimen-by-species similarity matrix.
    # NOTE(review): assumed to be array-like of shape (n_samples, n_species),
    # as the indexing below requires - confirm against the wrapper API.
    S = cy.concept_support_matrix(X, member_sets, lam=lam)

    # Highest similarity to a species OTHER than the specimen's own label:
    # blank out the own-species column on a float copy, then take the
    # row-wise maximum.
    masked = np.array(S, dtype=float)
    masked[np.arange(n_samples), y] = -np.inf
    cross_score = masked.max(axis=1)

    # Rank specimens by cross-species score, most ambiguous first.
    order = np.argsort(cross_score)[::-1]
    top = order[:top_k]

    print(f"Top {top_k} most ambiguous specimens (highest cross-species score):\n")
    # The three similarity columns are labelled for the Iris species.
    print(f" {'rank':>4} {'idx':>4} {'species':>11} "
          f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12} cross")
    for rank, idx in enumerate(top, 1):
        sims = S[idx]
        own = species_names[y[idx]]
        print(f" {rank:>4} {idx:>4} {own:>11} "
              f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f} {cross_score[idx]:.4f}")

    # Distribution of the reported specimens by species label.
    counts = np.bincount(y[top], minlength=n_species)

    print()
    print(f"Top {top_k} distribution by species:")
    for k, name in enumerate(species_names):
        print(f" {name:12s}: {counts[k]} of {top_k}")

    print()
    print("Observation:")
    print(" setosa is fully separable from the other two (Fisher 1936),")
    print(" so we expect zero or near-zero setosa specimens in the top 10.")
    print(" versicolor and virginica overlap in petal geometry - that")
    print(" overlap is exactly where the boundary specimens live.")

    # Label 0 is setosa in iris.target_names.
    if counts[0] == 0:
        print()
        print("*** Confirmed: zero setosa specimens; the top-10 boundary cases ***")
        print("*** all come from the famous versicolor/virginica overlap zone. ***")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,102 @@
|
||||
"""Demo 2 - Parameter-free anomaly detection.
|
||||
|
||||
Split a dataset into 'reference' (known-normal) and 'query' (a mix of
|
||||
normal and anomalous), and score each query by its similarity to the
|
||||
reference set. No labels touched on the query side, no thresholds
|
||||
set by hand, no training step.
|
||||
|
||||
We compare against sklearn's IsolationForest (with default settings)
|
||||
on the same data.
|
||||
|
||||
Run:
|
||||
python 02_anomaly_detection.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
from sem_cython12 import wrapper as cy
|
||||
|
||||
|
||||
def main() -> int:
    """Score synthetic queries by similarity to a known-normal reference set.

    Generates 500 standard-normal points and 10 shifted anomalies in 5-D,
    keeps 80% of the normals as the reference set, and scores the remaining
    normals plus all anomalies with ``cy.batch_max_similarity``. Prints
    top-k precision and, when scikit-learn is installed, ROC AUC together
    with an IsolationForest baseline fitted on the same reference set.

    Returns:
        1 if the compiled ``sem_cython12`` extension is not available,
        otherwise 0.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    rng = np.random.default_rng(0)
    N_NORMAL = 500
    N_ANOMALY = 10
    D = 5

    # Generate data: anomalies are the same distribution shifted by +8 on
    # every axis.
    normal = rng.standard_normal((N_NORMAL, D))
    anomalies = rng.standard_normal((N_ANOMALY, D)) + 8.0

    # Split: 80% of normals are 'reference' (known good), 20% are query.
    # Queries also include all anomalies.
    perm = rng.permutation(N_NORMAL)
    n_ref = int(0.8 * N_NORMAL)
    reference = normal[perm[:n_ref]]
    query_normal = normal[perm[n_ref:]]

    queries = np.vstack([query_normal, anomalies])
    # Ground truth for evaluation only: 0 = normal query, 1 = anomaly.
    y_query = np.concatenate([
        np.zeros(len(query_normal), dtype=int),
        np.ones(N_ANOMALY, dtype=int),
    ])

    # Auto-derive the scale from the reference set's geometry: median of the
    # finite values returned by nn_distances.
    nn = cy.nn_distances(reference)
    lam = float(np.median(nn[np.isfinite(nn)]))

    # Score each query by similarity to the reference.
    # Lower similarity = farther from anything known = anomaly, so negate
    # to get "higher score = more anomalous".
    sim = cy.batch_max_similarity(queries, reference, lam=lam)
    scores_sem = -sim

    top_k_sem = np.argsort(scores_sem)[::-1][:N_ANOMALY]
    correct_sem = int(np.sum(y_query[top_k_sem] == 1))

    rule = "=" * 60
    print(rule)
    print("SEM (sem_cython12 - one batch_max_similarity call)")
    print(rule)
    print(f" Top-{N_ANOMALY} retrieved as anomalous: precision = {correct_sem}/{N_ANOMALY}")

    # scikit-learn is optional here; only the imports are guarded so that an
    # unrelated ImportError raised later cannot be mistaken for "not installed".
    try:
        from sklearn.ensemble import IsolationForest
        from sklearn.metrics import roc_auc_score
    except ImportError:
        print("\n(Install scikit-learn to see the IsolationForest comparison.)")
        return 0

    auc_sem = roc_auc_score(y_query, scores_sem)
    print(f" ROC AUC = {auc_sem:.4f}")

    iso = IsolationForest(random_state=0, contamination='auto')
    iso.fit(reference)
    # score_samples is negated so that, like scores_sem, larger values rank
    # as more anomalous.
    scores_iso = -iso.score_samples(queries)
    top_k_iso = np.argsort(scores_iso)[::-1][:N_ANOMALY]
    correct_iso = int(np.sum(y_query[top_k_iso] == 1))
    auc_iso = roc_auc_score(y_query, scores_iso)

    print()
    print(rule)
    print("Baseline: sklearn IsolationForest (default settings)")
    print(rule)
    print(f" Top-{N_ANOMALY} retrieved as anomalous: precision = {correct_iso}/{N_ANOMALY}")
    print(f" ROC AUC = {auc_iso:.4f}")
    print()
    print(rule)
    if auc_sem >= auc_iso - 0.01:
        margin = auc_sem - auc_iso
        # The '+' format flag already emits the sign; prepending a second
        # "+" by hand printed "++0.0000".
        print(f"SEM matches IsolationForest within noise"
              f" ({margin:+.4f} AUC),")
        print("with one function call and zero tuning.")
    else:
        print(f"IsolationForest leads by {auc_iso - auc_sem:.4f} AUC; "
              f"SEM is competitive without parameters.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,106 @@
|
||||
"""Demo 3 - Multi-criteria candidate selection.
|
||||
|
||||
You have 100 candidates evaluated on 4 independent criteria
|
||||
(quality, cost-efficiency, robustness, compatibility - or whatever
|
||||
your domain calls them). You want to pick the ones worth a deeper
|
||||
look.
|
||||
|
||||
Naive ranking by total score finds the high-mean candidates - which
|
||||
are often single-criterion peaks whose high total masks weakness on
|
||||
the rest.
|
||||
|
||||
SEM's two-stage filter
|
||||
1) best-tradeoff filter ('Pareto core')
|
||||
2) cross-criterion filter ('non-redundant witnesses')
|
||||
finds the genuine all-rounders: candidates that are not strictly
|
||||
worse than another on every axis AND that contribute meaningfully on
|
||||
multiple axes (not just one).
|
||||
|
||||
Run:
|
||||
python 03_multicriteria_selection.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
from sem_cython12 import wrapper as cy
|
||||
|
||||
|
||||
def _print_profiles(title, S, indices, criteria_names) -> None:
    """Print a titled table of per-criterion scores for the given candidate rows."""
    print(title)
    print(f" {'idx':>4} " + " ".join(f"{n[:8]:>9}" for n in criteria_names) +
          f" {'min':>5} {'mean':>5}")
    for i in indices:
        scores = S[i]
        print(f" {int(i):>4} " +
              " ".join(f"{v:9.3f}" for v in scores) +
              f" {scores.min():5.2f} {scores.mean():5.2f}")
    print()


def main() -> int:
    """Compare naive sum-of-scores ranking with the SEM filters on synthetic candidates.

    Draws a 100 x 4 score matrix, overwrites the first five rows with
    moderately high scores on every criterion (the hidden 'all-rounders'),
    then reports how many of those rows appear in the naive top-10 and how
    many appear in the result of ``cy.non_redundant_witnesses``.

    Returns:
        1 if the compiled ``sem_cython12`` extension is not available,
        otherwise 0.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    rng = np.random.default_rng(7)

    N, K = 100, 4
    n_hidden = 5  # number of injected all-rounders, stored in rows 0..n_hidden-1
    criteria_names = ["Quality", "Cost-efficiency", "Robustness", "Compatibility"]

    # Most candidates: noisy uniform draws across the criteria.
    S = rng.uniform(0.30, 0.95, size=(N, K))

    # Inject hidden 'all-rounders' that score moderately well on EVERY
    # criterion - drawn from a narrower, higher band than the rest.
    S[:n_hidden] = rng.uniform(0.65, 0.85, size=(n_hidden, K))
    hidden = set(range(n_hidden))

    # ---- Naive ranking by sum of scores ---------------------------------
    naive_order = np.argsort(S.sum(axis=1))[::-1]
    naive_top10 = naive_order[:10]

    # ---- SEM ranking ----------------------------------------------------
    pareto_mask = cy.pareto_core_mask(S)
    pareto_idx = np.where(pareto_mask == 1)[0]

    # NOTE(review): nrw is used below as an array of row indices into S
    # (it is iterated and has .tolist()) - confirm against the wrapper API.
    nrw = cy.non_redundant_witnesses(S)

    # ---- Reporting ------------------------------------------------------
    print(f"Candidates : {N}")
    print(f"Criteria : {K} ({', '.join(criteria_names)})")
    print()
    print(f"Best-tradeoff frontier size : {len(pareto_idx)}")
    print(f"Cross-criterion winners (NRW) : {len(nrw)}")
    print(f"Hidden all-rounders we injected : {n_hidden} (indices 0-{n_hidden - 1})")
    print()

    overlap_with_hidden = set(nrw.tolist()) & hidden
    naive_overlap_with_hidden = set(naive_top10.tolist()) & hidden
    print(f"NRW recovered hidden all-rounders : "
          f"{len(overlap_with_hidden)}/{n_hidden} {sorted(overlap_with_hidden)}")
    print(f"Naive top-10 found hidden all-rounders: "
          f"{len(naive_overlap_with_hidden)}/{n_hidden} {sorted(naive_overlap_with_hidden)}")
    print()

    # Score profiles: every NRW candidate, then the naive top-3 for contrast.
    _print_profiles("Cross-criterion winners (NRW) - score profiles:",
                    S, nrw, criteria_names)
    _print_profiles("Naive top-3 (by total score) - score profiles for comparison:",
                    S, naive_top10[:3], criteria_names)

    # Wow line - honest comparison: the "surfaces N candidates" claim is only
    # printed when NRW actually recovered more hidden rows than the naive top-10.
    n_nrw_hits = len(overlap_with_hidden)
    n_naive_hits = len(naive_overlap_with_hidden)
    print(f"*** SEM's NRW filter recovered {n_nrw_hits}/{n_hidden} hidden all-rounders. ***")
    print(f"*** Naive sum-of-scores top-10 found only {n_naive_hits}/{n_hidden}. ***")
    if n_nrw_hits > n_naive_hits:
        print(f"*** SEM surfaces {n_nrw_hits - n_naive_hits} candidates the naive ranking misses ***")
        print("*** because they don't peak on any single criterion. ***")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,228 @@
|
||||
sem_cython12 - LICENCE
|
||||
|
||||
Copyright (c) 2026 Sevana / Valeri Sitnikov. All rights reserved.
|
||||
|
||||
This document grants limited rights to use the Software (defined
|
||||
below) under specific conditions. By downloading, installing, or
|
||||
using the Software you accept the terms set out below. If you do
|
||||
not accept these terms, do not use the Software.
|
||||
|
||||
================================================================
|
||||
1. DEFINITIONS
|
||||
================================================================
|
||||
|
||||
"Software" means the sem_cython12 library, including all source
|
||||
code, compiled binaries, documentation, examples, configuration
|
||||
files, and any other materials distributed as part of this
|
||||
repository, together with any modifications, patches, or
|
||||
derivative works thereof.
|
||||
|
||||
"Licensor" means Sevana / Valeri Sitnikov, the entity that owns
|
||||
the copyright to the Software.
|
||||
|
||||
"You" / "Licensee" means the individual or legal entity exercising
|
||||
rights under this licence.
|
||||
|
||||
"Research Use" means use of the Software exclusively for one or
|
||||
more of the following purposes:
|
||||
|
||||
(a) academic research conducted at a degree-granting institution
|
||||
and not directly funded by a commercial sponsor for the
|
||||
purpose of producing a commercial product;
|
||||
|
||||
(b) education, including teaching, coursework, theses, and
|
||||
student projects;
|
||||
|
||||
(c) personal study, self-education, and individual experiments
|
||||
that do not generate revenue;
|
||||
|
||||
(d) evaluation of the Software for the purpose of deciding
|
||||
whether to obtain a commercial licence (limited to a
|
||||
reasonable evaluation period of no more than ninety (90)
|
||||
days);
|
||||
|
||||
(e) non-profit scientific work whose results are published
|
||||
openly and from which the user does not derive financial
|
||||
gain.
|
||||
|
||||
"Commercial Use" means any use of the Software that is not
|
||||
Research Use, including but not limited to:
|
||||
|
||||
(a) integration of the Software, in whole or in part, into a
|
||||
product or service offered to third parties for
|
||||
compensation;
|
||||
|
||||
(b) deployment of the Software within an enterprise to support
|
||||
a revenue-generating activity, regardless of whether the
|
||||
Software itself is sold;
|
||||
|
||||
(c) use of the Software in consulting, advisory, or
|
||||
professional-services engagements where compensation is
|
||||
received;
|
||||
|
||||
(d) any use whose direct or indirect purpose is to derive
|
||||
financial benefit, competitive advantage, or other
|
||||
commercial value;
|
||||
|
||||
(e) use by any for-profit organisation in the course of its
|
||||
ordinary business operations.
|
||||
|
||||
================================================================
|
||||
2. GRANT FOR RESEARCH USE
|
||||
================================================================
|
||||
|
||||
Subject to your compliance with this licence, the Licensor grants
|
||||
you a non-exclusive, non-transferable, non-sublicensable,
|
||||
royalty-free, worldwide right to use, copy, modify, and run the
|
||||
Software solely for Research Use.
|
||||
|
||||
You may share copies of the Software, including modifications,
|
||||
with other parties exercising Research Use, provided that:
|
||||
|
||||
(a) this licence file accompanies every copy you distribute;
|
||||
|
||||
(b) you do not represent the Software, or any derivative work,
|
||||
as your own original creation;
|
||||
|
||||
(c) you do not sublicense the Software under terms more
|
||||
permissive than this licence.
|
||||
|
||||
================================================================
|
||||
3. COMMERCIAL USE
|
||||
================================================================
|
||||
|
||||
Commercial Use of the Software is NOT granted under this licence.
|
||||
Any Commercial Use without a separate written commercial licence
|
||||
from the Licensor is prohibited and constitutes a material breach
|
||||
of this licence and an infringement of the Licensor's rights.
|
||||
|
||||
To obtain a commercial licence, contact the Licensor at:
|
||||
|
||||
sales@sevana.biz
|
||||
|
||||
The Licensor will provide commercial licensing terms upon request
|
||||
and reserves the right to grant or decline a commercial licence
|
||||
at its sole discretion.
|
||||
|
||||
================================================================
|
||||
4. NO WARRANTY
|
||||
================================================================
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, ACCURACY,
|
||||
TITLE, AND NON-INFRINGEMENT.
|
||||
|
||||
THE LICENSOR DOES NOT WARRANT THAT:
|
||||
|
||||
(a) the Software will operate uninterrupted or error-free;
|
||||
|
||||
(b) the Software will be free of defects, viruses, or other
|
||||
harmful components;
|
||||
|
||||
(c) the Software will meet the Licensee's requirements;
|
||||
|
||||
(d) any results obtained from the Software will be accurate,
|
||||
reliable, or fit for any particular purpose.
|
||||
|
||||
You assume all risk associated with installation, configuration,
|
||||
operation, and use of the Software. Any decisions or actions
|
||||
taken in reliance on the Software are made at your own risk.
|
||||
|
||||
================================================================
|
||||
5. LIMITATION OF LIABILITY
|
||||
================================================================
|
||||
|
||||
TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT
|
||||
SHALL THE LICENSOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING BUT NOT
|
||||
LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THE SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
The Licensor's total aggregate liability under this licence shall
|
||||
not exceed zero (0) US Dollars (USD).
|
||||
|
||||
================================================================
|
||||
6. TERMINATION
|
||||
================================================================
|
||||
|
||||
This licence remains in effect until terminated.
|
||||
|
||||
The licence terminates automatically and without notice if you
|
||||
breach any of its terms. In particular, any Commercial Use
|
||||
without a separate written commercial licence from the Licensor
|
||||
constitutes immediate breach.
|
||||
|
||||
Upon termination, you must:
|
||||
|
||||
(a) cease all use of the Software;
|
||||
|
||||
(b) destroy all copies of the Software in your possession or
|
||||
control, including any modifications or derivative works;
|
||||
|
||||
(c) certify in writing to the Licensor, on request, that you
|
||||
have done so.
|
||||
|
||||
The Licensor may terminate this licence at any time by giving
|
||||
written notice to you.
|
||||
|
||||
The disclaimers in sections 4 and 5 survive termination.
|
||||
|
||||
================================================================
|
||||
7. NO TRANSFER OF OWNERSHIP
|
||||
================================================================
|
||||
|
||||
All right, title, and interest in and to the Software, including
|
||||
all intellectual property rights therein, remain with the
|
||||
Licensor. This licence does not transfer ownership of the
|
||||
Software, of any modification, or of any derivative work to the
|
||||
Licensee.
|
||||
|
||||
This licence does not grant you any rights to use the Licensor's
|
||||
name, trade marks, service marks, logos, or any other branding,
|
||||
except as required by section 8 (Attribution).
|
||||
|
||||
================================================================
|
||||
8. ATTRIBUTION
|
||||
================================================================
|
||||
|
||||
When publishing research, papers, theses, talks, or any other
|
||||
academic output that uses or relies on the Software, please cite
|
||||
the Licensor and the repository URL:
|
||||
|
||||
Sevana / Valeri Sitnikov, sem_cython12 (year of access),
|
||||
https://git.sevana.biz/vvs/sem_cython12
|
||||
|
||||
================================================================
|
||||
9. GOVERNING LAW
|
||||
================================================================
|
||||
|
||||
This licence is governed by and construed in accordance with the
|
||||
laws of the Republic of Estonia, without regard to conflict-of-laws
|
||||
principles.
|
||||
|
||||
Any disputes arising out of or in connection with this licence
|
||||
shall be resolved in the competent courts of the Republic of
|
||||
Estonia, to whose jurisdiction both parties submit.
|
||||
|
||||
================================================================
|
||||
10. ENTIRE AGREEMENT
|
||||
================================================================
|
||||
|
||||
This licence constitutes the entire agreement between you and the
|
||||
Licensor regarding the Software (other than any separate written
|
||||
commercial licence which may be executed) and supersedes all
|
||||
prior or contemporaneous understandings, written or oral.
|
||||
|
||||
If any provision of this licence is held to be unenforceable, the
|
||||
remaining provisions shall continue in full force and effect.
|
||||
|
||||
================================================================
|
||||
For commercial-licence inquiries: sales@sevana.biz
|
||||
================================================================
|
||||
|
||||
END OF LICENCE
|
||||
@@ -0,0 +1,128 @@
|
||||
# sem_cython12 - sample projects
|
||||
|
||||
Three short, runnable Python projects that demonstrate the `sem_cython12`
|
||||
library on small but realistic problems. Each demo is a single file,
|
||||
self-contained, and produces a clear printable result.
|
||||
|
||||
The demos use **only** `sem_cython12.wrapper`, `numpy`, and (for the
|
||||
Iris and anomaly demos) `scikit-learn`.
|
||||
|
||||
## What each demo shows
|
||||
|
||||
| File | Domain | "Wow" |
|
||||
|---|---|---|
|
||||
| [`01_iris_boundary.py`](./01_iris_boundary.py) | The 1936 Iris dataset | Rediscovers the famous versicolor/virginica boundary specimens **without training a classifier** and without setting any threshold. |
|
||||
| [`02_anomaly_detection.py`](./02_anomaly_detection.py) | Synthetic 5-D anomalies | Detects 10/10 injected anomalies with **a single function call** and matches/beats sklearn's IsolationForest on ROC AUC. |
|
||||
| [`03_multicriteria_selection.py`](./03_multicriteria_selection.py) | Multi-criteria candidate ranking | Recovers all of the **hidden all-rounders**, where a naive sum-of-scores top-10 finds only some of them. |
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
# Get the library (private repo)
|
||||
git clone https://git.sevana.biz/vvs/sem_cython12.git ../sem_cython12
|
||||
export PYTHONPATH="$(pwd)/../sem_cython12:$PYTHONPATH"
|
||||
|
||||
# Demo dependencies
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
The pre-built Linux x86_64 / CPython 3.12 binary ships with the
|
||||
library; no compilation step is required.
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
python 01_iris_boundary.py
|
||||
python 02_anomaly_detection.py
|
||||
python 03_multicriteria_selection.py
|
||||
```
|
||||
|
||||
Each demo finishes in well under a second on a laptop.
|
||||
|
||||
## What you'll see
|
||||
|
||||
### 01_iris_boundary.py
|
||||
|
||||
```
|
||||
Auto-derived kernel scale lam = 3.4762
|
||||
|
||||
Top 10 most ambiguous specimens (highest cross-species score):
|
||||
|
||||
rank idx species sim->setosa sim->versic sim->virgin cross
|
||||
1 138 virginica 0.2330 0.9096 1.0000 0.9096
|
||||
2 70 versicolor 0.2396 1.0000 0.9096 0.9096
|
||||
3 127 virginica 0.2222 0.8806 1.0000 0.8806
|
||||
4 83 versicolor 0.2084 1.0000 0.8689 0.8689
|
||||
5 133 virginica 0.2062 0.8689 1.0000 0.8689
|
||||
...
|
||||
|
||||
Top 10 distribution by species:
|
||||
setosa : 0 of 10
|
||||
versicolor : 3 of 10
|
||||
virginica : 7 of 10
|
||||
|
||||
*** Confirmed: zero setosa specimens; the top-10 boundary cases ***
|
||||
*** all come from the famous versicolor/virginica overlap zone. ***
|
||||
```
|
||||
|
||||
### 02_anomaly_detection.py
|
||||
|
||||
```
|
||||
SEM (sem_cython12 - one batch_max_similarity call)
|
||||
Top-10 retrieved as anomalous: precision = 10/10
|
||||
ROC AUC = 1.0000
|
||||
|
||||
Baseline: sklearn IsolationForest (default settings)
|
||||
Top-10 retrieved as anomalous: precision = 10/10
|
||||
ROC AUC = 1.0000
|
||||
|
||||
SEM matches IsolationForest within noise (+0.0000 AUC),
|
||||
with one function call and zero tuning.
|
||||
```
|
||||
|
||||
### 03_multicriteria_selection.py
|
||||
|
||||
```
|
||||
Best-tradeoff frontier size : 35
|
||||
Cross-criterion winners (NRW) : 31
|
||||
Hidden all-rounders we injected : 5 (indices 0-4)
|
||||
|
||||
NRW recovered hidden all-rounders : 5/5 [0, 1, 2, 3, 4]
|
||||
Naive top-10 found hidden all-rounders: 3/5 [1, 2, 3]
|
||||
|
||||
*** SEM's NRW filter recovered 5/5 hidden all-rounders. ***
|
||||
*** Naive sum-of-scores top-10 found only 3/5. ***
|
||||
*** SEM surfaces 2 candidates the naive ranking misses ***
|
||||
*** because they don't peak on any single criterion. ***
|
||||
```
|
||||
|
||||
## What to try next
|
||||
|
||||
- Replace the synthetic data in `02_*` with your own observations and
|
||||
see what gets flagged.
|
||||
- Replace the synthetic candidate matrix in `03_*` with your
|
||||
real-world multi-criteria evaluation (job applicants, vendor
|
||||
proposals, product features, drug screens).
|
||||
- Extend `01_*` to your own classification problems: any time you
|
||||
have multiple classes with overlapping members, the cross-class score
|
||||
surfaces the structurally informative boundary cases.
|
||||
|
||||
The library has more capabilities than these three demos exercise.
|
||||
See the `sem_cython12.wrapper` API for the full operator set
|
||||
(pairwise distances, multi-class similarity matrix, incremental
|
||||
aggregation, etc.).
|
||||
|
||||
## Licence
|
||||
|
||||
The demos and the underlying `sem_cython12` library are licensed
|
||||
under the terms in the [LICENSE](./LICENSE) file:
|
||||
|
||||
- Research and non-commercial use: free under the conditions
|
||||
stated in the licence.
|
||||
- Commercial use: requires a separate written commercial licence.
|
||||
Contact `sales@sevana.biz`.
|
||||
- The Software is provided strictly "AS IS", without warranty of
|
||||
any kind.
|
||||
|
||||
Please read the LICENSE file in full before using the demos or the
|
||||
underlying library.
|
||||
@@ -0,0 +1,2 @@
|
||||
numpy>=1.23
|
||||
scikit-learn>=1.0
|
||||
Reference in New Issue
Block a user