100 lines
3.5 KiB
Python
100 lines
3.5 KiB
Python
"""Demo 1 - Iris boundary rediscovery (no training).
|
|
|
|
The Iris dataset (Fisher 1936) contains 50 specimens of each of three species:
|
|
setosa, versicolor, virginica. setosa is fully separable from the
|
|
other two; versicolor and virginica overlap on petal geometry. Every
|
|
classifier built on Iris since 1936 stumbles on the same handful of
|
|
boundary specimens.
|
|
|
|
We find them WITHOUT training a classifier:
|
|
|
|
1. Group specimens by species.
|
|
2. Auto-derive a kernel scale from the data's own geometry.
|
|
3. Compute the (150, 3) similarity matrix.
|
|
4. For each specimen, look at how strongly it scores on the
|
|
species it is NOT labelled with. Highest cross-species score
|
|
ranks the most ambiguous specimens.
|
|
|
|
Run:
|
|
python 01_iris_boundary.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import numpy as np
|
|
from sklearn.datasets import load_iris
|
|
|
|
from sem_cython12 import wrapper as cy
|
|
|
|
|
|
def main() -> int:
    """Rank Iris specimens by cross-species similarity and print a report.

    Loads the Iris data, derives a kernel scale from the median pairwise
    distance, asks ``sem_cython12`` for the specimen-by-species similarity
    matrix, and scores every specimen by its highest similarity to a
    species other than its own label.  The ten highest-scoring specimens
    are printed together with their distribution by species.

    Returns:
        1 if the compiled ``sem_cython12`` extension is not available,
        otherwise 0.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    iris = load_iris()
    X = iris.data                      # (150, 4)
    y = iris.target                    # (150,)
    species_names = iris.target_names

    # Take the sizes from the data instead of repeating 150 / 3 as
    # literals, so every array below stays consistent with what was loaded.
    n_samples = X.shape[0]
    n_species = len(species_names)

    # Auto-derived kernel scale (median pairwise distance over the
    # whole dataset; no human picks this number).  Only the strict upper
    # triangle is used so each pair counts once and the diagonal is
    # excluded.
    dist = cy.pairwise_distances(X)
    upper = np.triu_indices(dist.shape[0], k=1)
    lam = float(np.median(dist[upper]))
    print(f"Auto-derived kernel scale lam = {lam:.4f}\n")

    # Per-species reference sets
    member_sets = [X[y == k] for k in range(n_species)]

    # (n_samples, n_species) similarity matrix
    S = cy.concept_support_matrix(X, member_sets, lam=lam)

    # For each specimen, compute the highest similarity to a species
    # OTHER than its own.  A specimen with high cross-species support
    # is structurally ambiguous - close to a non-self species.
    # Work on a float64 copy: the own-species column is masked with -inf
    # so the row maximum is taken over the remaining species only, and S
    # itself stays intact for the report below.
    others = np.array(S, dtype=np.float64)
    others[np.arange(n_samples), y] = -np.inf
    cross_score = others.max(axis=1)

    # Rank specimens by cross-species score.  Top entries = the famous
    # boundary cases.
    order = np.argsort(cross_score)[::-1]
    top = order[:10]

    print("Top 10 most ambiguous specimens (highest cross-species score):\n")
    print(f" {'rank':>4} {'idx':>4} {'species':>11} "
          f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12} cross")
    for rank, idx in enumerate(top, 1):
        sims = S[idx]
        own = species_names[y[idx]]
        print(f" {rank:>4} {idx:>4} {own:>11} "
              f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f} {cross_score[idx]:.4f}")

    # Distribution of those top 10 by species.  minlength guarantees one
    # entry per species even when a species does not appear at all.
    counts = np.bincount(y[top], minlength=n_species).tolist()

    print()
    print("Top 10 distribution by species:")
    for name, count in zip(species_names, counts):
        print(f" {name:12s}: {count} of 10")

    print()
    print("Observation:")
    print(" setosa is fully separable from the other two (Fisher 1936),")
    print(" so we expect zero or near-zero setosa specimens in the top 10.")
    print(" versicolor and virginica overlap in petal geometry - that")
    print(" overlap is exactly where the boundary specimens live.")

    # Label 0 is setosa; the confirmation banner is printed only when no
    # setosa specimen made it into the top 10.
    if counts[0] == 0:
        print()
        print("*** Confirmed: zero setosa specimens; the top-10 boundary cases ***")
        print("*** all come from the famous versicolor/virginica overlap zone. ***")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the demo and hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|