# sem_cython12/demos/01_iris_boundary.py

"""Demo 1 - Iris boundary rediscovery (no training).

The Iris dataset (Fisher 1936) contains 150 specimens: 50 from each of
three species (setosa, versicolor, virginica).  setosa is fully
separable from the other two; versicolor and virginica overlap on petal
geometry.  Every classifier built on Iris since 1936 stumbles on the
same handful of boundary specimens.

We find them WITHOUT training a classifier:

  1. Group specimens by species.
  2. Auto-derive a kernel scale from the data's own geometry.
  3. Compute the (150, 3) similarity matrix.
  4. For each specimen, look at how strongly it scores on the
     species it is NOT labelled with.  Highest cross-species score
     ranks the most ambiguous specimens.

Run:
    python 01_iris_boundary.py
"""

from __future__ import annotations

import numpy as np
from sklearn.datasets import load_iris

from sem_cython12 import wrapper as cy


def main() -> int:
    """Rank Iris specimens by cross-species similarity and print a report.

    Steps:
      1. Load Iris and split the feature rows into one reference set per
         species label.
      2. Derive the kernel scale ``lam`` as the median of all pairwise
         distances (upper triangle, diagonal excluded).
      3. Build the specimen-by-species similarity matrix with
         ``cy.concept_support_matrix``.
      4. Score every specimen by its highest similarity to a species
         other than the one it is labelled with, and print the
         highest-scoring specimens plus their per-species tally.

    Returns:
        0 on success, 1 if ``cy.available()`` reports that the compiled
        extension did not load.
    """
    if not cy.available():
        print("ERROR: sem_cython12 compiled extension did not load.")
        return 1

    iris = load_iris()
    X = iris.data                           # (n_specimens, n_features)
    y = iris.target                         # (n_specimens,) integer labels
    species_names = iris.target_names

    # Sizes come from the data instead of being hard-coded as 150 / 3.
    n_specimens = X.shape[0]
    n_species = len(species_names)
    top_n = min(10, n_specimens)

    # Auto-derived kernel scale (median pairwise distance over the
    # whole dataset; no human picks this number).  Only the strict upper
    # triangle is used so each pair counts once and the zero diagonal
    # does not bias the median.
    dist = cy.pairwise_distances(X)
    upper = np.triu_indices(dist.shape[0], k=1)
    lam = float(np.median(dist[upper]))
    print(f"Auto-derived kernel scale lam = {lam:.4f}\n")

    # Per-species reference sets
    member_sets = [X[y == k] for k in range(n_species)]

    # (n_specimens, n_species) similarity matrix
    S = cy.concept_support_matrix(X, member_sets, lam=lam)

    # For each specimen, compute the highest similarity to a species
    # OTHER than its own.  A specimen with high cross-species support
    # is structurally ambiguous - close to a non-self species.
    # Work on a float copy so the own-species column can be masked out
    # with -inf without touching S, which is still needed for printing.
    masked = np.array(S, dtype=float)
    masked[np.arange(n_specimens), y] = -np.inf
    cross_score = masked.max(axis=1)

    # Rank specimens by cross-species score.  Top entries = the famous
    # boundary cases.
    order = np.argsort(cross_score)[::-1]
    top = order[:top_n]
    print(f"Top {top_n} most ambiguous specimens (highest cross-species score):\n")
    # NOTE: the table layout below is specific to the three Iris species.
    print(f"  {'rank':>4} {'idx':>4} {'species':>11} "
          f"{'sim->setosa':>12} {'sim->versic':>12} {'sim->virgin':>12}  cross")
    for rank, idx in enumerate(top, 1):
        sims = S[idx]
        own = species_names[y[idx]]
        print(f"  {rank:>4} {idx:>4} {own:>11} "
              f"{sims[0]:>12.4f} {sims[1]:>12.4f} {sims[2]:>12.4f}  {cross_score[idx]:.4f}")

    # Distribution of the top-ranked specimens by species; minlength
    # keeps a zero entry for species that do not appear at all.
    counts = np.bincount(y[top], minlength=n_species)

    print()
    print(f"Top {top_n} distribution by species:")
    for k, name in enumerate(species_names):
        print(f"  {name:12s}: {counts[k]} of {top_n}")

    print()
    print("Observation:")
    print("  setosa is fully separable from the other two (Fisher 1936),")
    print(f"  so we expect zero or near-zero setosa specimens in the top {top_n}.")
    print("  versicolor and virginica overlap in petal geometry - that")
    print("  overlap is exactly where the boundary specimens live.")

    # Label 0 is treated as setosa, matching the column order of the
    # table printed above.
    if counts[0] == 0:
        print()
        print(f"*** Confirmed: zero setosa specimens; the top-{top_n} boundary cases ***")
        print("*** all come from the famous versicolor/virginica overlap zone. ***")
    return 0


if __name__ == "__main__":
    # Run the demo and hand its return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)