Vendor demos under demos/ and link from README for landing-page visibility
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
"""Demo 1 - Iris boundary rediscovery (no training).
|
||||
|
||||
The Iris dataset (Fisher 1936) contains 50 specimens of each of three species:
|
||||
setosa, versicolor, virginica. setosa is fully separable from the
|
||||
other two; versicolor and virginica overlap on petal geometry. Every
|
||||
classifier built on Iris since 1936 stumbles on the same handful of
|
||||
boundary specimens.
|
||||
|
||||
We find them WITHOUT training a classifier:
|
||||
|
||||
1. Group specimens by species.
|
||||
2. Auto-derive a kernel scale from the data's own geometry.
|
||||
3. Compute the (150, 3) similarity matrix.
|
||||
4. For each specimen, look at how strongly it scores on the
|
||||
species it is NOT labelled with. Highest cross-species score
|
||||
ranks the most ambiguous specimens.
|
||||
|
||||
Run:
|
||||
python 01_iris_boundary.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
from sem_cython12 import wrapper as cy
|
||||
|
||||
|
||||
def main() -> int:
    """Rank Iris specimens by their support for a species other than their own.

    Loads the Iris data, derives the kernel scale ``lam`` as the median of
    the upper-triangle entries of ``cy.pairwise_distances(X)``, builds the
    specimen-by-species matrix with ``cy.concept_support_matrix``, and
    prints the ten specimens with the highest cross-species score followed
    by a per-species tally of those ten.

    Returns:
        1 if ``cy.available()`` reports that the compiled extension did not
        load, otherwise 0.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    iris = load_iris()
    X = iris.data               # (150, 4)
    y = iris.target             # (150,)
    species_names = iris.target_names

    # Sizes come from the data rather than being hard-coded.
    n_samples = X.shape[0]
    n_species = len(species_names)

    # Auto-derived kernel scale (median pairwise distance over the
    # whole dataset; no human picks this number).  Only the strict upper
    # triangle is used so each pair counts once and the diagonal is skipped.
    dists = cy.pairwise_distances(X)
    upper = np.triu_indices(dists.shape[0], k=1)
    lam = float(np.median(dists[upper]))
    print(f"Auto-derived kernel scale lam = {lam:.4f}\n")

    # Per-species reference sets
    member_sets = [X[y == k] for k in range(n_species)]

    # (n_samples, n_species) similarity matrix
    # NOTE(review): assumes the wrapper returns something convertible to a
    # 2-D float array - confirm against sem_cython12.
    S = cy.concept_support_matrix(X, member_sets, lam=lam)

    # For each specimen, compute the highest similarity to a species
    # OTHER than its own.  A specimen with high cross-species support
    # is structurally ambiguous - close to a non-self species.
    # Work on a copy with the own-species column set to -inf so that a
    # plain row-wise max only ever picks a non-self species.
    masked = np.array(S, dtype=float)
    masked[np.arange(n_samples), y] = -np.inf
    cross_score = masked.max(axis=1)

    # Rank specimens by cross-species score.  Top entries = the famous
    # boundary cases.
    order = np.argsort(cross_score)[::-1]
    top = order[:10]

    print("Top 10 most ambiguous specimens (highest cross-species score):\n")
    print(f" {'rank':>4} {'idx':>4} {'species':>11} "
          f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12} cross")
    for rank, idx in enumerate(top, 1):
        sims = S[idx]
        own = species_names[y[idx]]
        print(f" {rank:>4} {idx:>4} {own:>11} "
              f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f} {cross_score[idx]:.4f}")

    # Distribution of those top 10 by species; minlength keeps a zero
    # entry for any species that does not appear at all.
    counts = np.bincount(y[top], minlength=n_species).tolist()

    print()
    print("Top 10 distribution by species:")
    for k, name in enumerate(species_names):
        print(f" {name:12s}: {counts[k]} of 10")

    print()
    print("Observation:")
    print(" setosa is fully separable from the other two (Fisher 1936),")
    print(" so we expect zero or near-zero setosa specimens in the top 10.")
    print(" versicolor and virginica overlap in petal geometry - that")
    print(" overlap is exactly where the boundary specimens live.")

    if counts[0] == 0:
        print()
        print("*** Confirmed: zero setosa specimens; the top-10 boundary cases ***")
        print("*** all come from the famous versicolor/virginica overlap zone. ***")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand main()'s integer status to the interpreter as the process exit code.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user