Vendor demos under demos/ and link from README for landing-page visibility

2026-05-09 15:25:52 +01:00
parent 6597509f79
commit c886ded981
5 changed files with 452 additions and 0 deletions
@@ -0,0 +1,99 @@
+"""Demo 1 - Iris boundary rediscovery (no training).
+
+The Iris dataset (Fisher 1936) contains 150 specimens, 50 from each of
+three species: setosa, versicolor, virginica.  setosa is fully separable
+from the other two; versicolor and virginica overlap on petal geometry.
+Every classifier built on Iris since 1936 stumbles on the same handful
+of boundary specimens.
+
+We find them WITHOUT training a classifier:
+
+  1. Group specimens by species.
+  2. Auto-derive a kernel scale from the data's own geometry.
+  3. Compute the (150, 3) similarity matrix.
+  4. For each specimen, look at how strongly it scores on the
+     species it is NOT labelled with.  Highest cross-species score
+     ranks the most ambiguous specimens.
+
+Run:
+    python 01_iris_boundary.py
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from sklearn.datasets import load_iris
+
+from sem_cython12 import wrapper as cy
+
+
+def main() -> int:
+    if not cy.available():
+        print("ERROR: sem_cython12 compiled extension did not load.")
+        return 1
+
+    iris = load_iris()
+    X = iris.data                           # (150, 4)
+    y = iris.target                         # (150,)
+    species_names = iris.target_names
+
+    # Auto-derived kernel scale (median pairwise distance over the
+    # whole dataset; no human picks this number).
+    pd = cy.pairwise_distances(X)
+    iu = np.triu_indices(pd.shape[0], k=1)
+    lam = float(np.median(pd[iu]))
+    print(f"Auto-derived kernel scale lam = {lam:.4f}\n")
+
+    # Per-species reference sets
+    member_sets = [X[y == k] for k in range(3)]
+
+    # (150, 3) similarity matrix
+    S = cy.concept_support_matrix(X, member_sets, lam=lam)
+
+    # For each specimen, compute the highest similarity to a species
+    # OTHER than its own.  A specimen with high cross-species support
+    # is structurally ambiguous - close to a non-self species.
+    cross_score = np.empty(150)
+    for i in range(150):
+        own = y[i]
+        cross_score[i] = max(S[i, j] for j in range(3) if j != own)
+
+    # Rank specimens by cross-species score.  Top entries = the famous
+    # boundary cases.
+    order = np.argsort(cross_score)[::-1]
+    print(f"Top 10 most ambiguous specimens (highest cross-species score):\n")
+    print(f"  {'rank':>4} {'idx':>4} {'species':>11} "
+          f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12}  cross")
+    for rank, idx in enumerate(order[:10], 1):
+        sims = S[idx]
+        own = species_names[y[idx]]
+        print(f"  {rank:>4} {idx:>4} {own:>11} "
+              f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f}  {cross_score[idx]:.4f}")
+
+    # Distribution of those top 10 by species
+    top10_species = [int(y[i]) for i in order[:10]]
+    counts = {0: 0, 1: 0, 2: 0}
+    for s in top10_species:
+        counts[s] += 1
+
+    print()
+    print("Top 10 distribution by species:")
+    for k, name in enumerate(species_names):
+        print(f"  {name:12s}: {counts[k]} of 10")
+
+    print()
+    print("Observation:")
+    print("  setosa is fully separable from the other two (Fisher 1936),")
+    print("  so we expect zero or near-zero setosa specimens in the top 10.")
+    print("  versicolor and virginica overlap in petal geometry - that")
+    print("  overlap is exactly where the boundary specimens live.")
+
+    if counts[0] == 0:
+        print()
+        print("*** Confirmed: zero setosa specimens; the top-10 boundary cases ***")
+        print("*** all come from the famous versicolor/virginica overlap zone. ***")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())