"""Demo 1 - Iris boundary rediscovery (no training). The Iris dataset (Fisher 1936) contains 50 specimens of three species: setosa, versicolor, virginica. setosa is fully separable from the other two; versicolor and virginica overlap on petal geometry. Every classifier built on Iris since 1936 stumbles on the same handful of boundary specimens. We find them WITHOUT training a classifier: 1. Group specimens by species. 2. Auto-derive a kernel scale from the data's own geometry. 3. Compute the (150, 3) similarity matrix. 4. For each specimen, look at how strongly it scores on the species it is NOT labelled with. Highest cross-species score ranks the most ambiguous specimens. Run: python 01_iris_boundary.py """ from __future__ import annotations import numpy as np from sklearn.datasets import load_iris from sem_cython12 import wrapper as cy def main() -> int: if not cy.available(): print("ERROR: sem_cython12 compiled extension did not load.") return 1 iris = load_iris() X = iris.data # (150, 4) y = iris.target # (150,) species_names = iris.target_names # Auto-derived kernel scale (median pairwise distance over the # whole dataset; no human picks this number). pd = cy.pairwise_distances(X) iu = np.triu_indices(pd.shape[0], k=1) lam = float(np.median(pd[iu])) print(f"Auto-derived kernel scale lam = {lam:.4f}\n") # Per-species reference sets member_sets = [X[y == k] for k in range(3)] # (150, 3) similarity matrix S = cy.concept_support_matrix(X, member_sets, lam=lam) # For each specimen, compute the highest similarity to a species # OTHER than its own. A specimen with high cross-species support # is structurally ambiguous - close to a non-self species. cross_score = np.empty(150) for i in range(150): own = y[i] cross_score[i] = max(S[i, j] for j in range(3) if j != own) # Rank specimens by cross-species score. Top entries = the famous # boundary cases. order = np.argsort(cross_score)[::-1] print(f"Top 10 most ambiguous specimens (highest cross-species score):\n") print(f" {'rank':>4} {'idx':>4} {'species':>11} " f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12} cross") for rank, idx in enumerate(order[:10], 1): sims = S[idx] own = species_names[y[idx]] print(f" {rank:>4} {idx:>4} {own:>11} " f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f} {cross_score[idx]:.4f}") # Distribution of those top 10 by species top10_species = [int(y[i]) for i in order[:10]] counts = {0: 0, 1: 0, 2: 0} for s in top10_species: counts[s] += 1 print() print("Top 10 distribution by species:") for k, name in enumerate(species_names): print(f" {name:12s}: {counts[k]} of 10") print() print("Observation:") print(" setosa is fully separable from the other two (Fisher 1936),") print(" so we expect zero or near-zero setosa specimens in the top 10.") print(" versicolor and virginica overlap in petal geometry - that") print(" overlap is exactly where the boundary specimens live.") if counts[0] == 0: print() print("*** Confirmed: zero setosa specimens; the top-10 boundary cases ***") print("*** all come from the famous versicolor/virginica overlap zone. ***") return 0 if __name__ == "__main__": raise SystemExit(main())