Sanitize wrapper docstrings + README: remove kernel formula and metric-specific exposure

The previous version exposed the following in both the Python docstrings and the README API tables:

- `exp(-d/lam)` as the literal similarity-kernel form
- 'Euclidean' as the literal distance metric
- the O1+O2 conditions of the one-sided-mask routine

These are replaced with operational descriptions: 'similarity score in [0,1] against the closest member', 'distance matrix between rows', etc. The library's behaviour and call signatures are unchanged.
2026-05-09 14:22:01 +01:00
parent 39d9f0d71a
commit 3e588f8024
2 changed files with 39 additions and 36 deletions
@@ -75,18 +75,18 @@ internally cast to contiguous `float64`.  Outputs are numpy arrays.

 | Function | Inputs | Output |
 |---|---|---|
-| `batch_max_similarity(X_query, X_members, lam)` | `(Q, D)`, `(M, D)`, `lam > 0` | `(Q,)` - per-query max of `exp(-d / lam)` |
-| `concept_support_matrix(X_query, member_mats, lam)` | `(Q, D)`, list of `(M_k, D)`, `lam > 0` | `(Q, K)` - one column per member matrix |
-| `pairwise_distances(X)` | `(N, D)` | `(N, N)` - symmetric Euclidean matrix |
+| `batch_max_similarity(X_query, X_members, lam)` | `(Q, D)`, `(M, D)`, `lam > 0` | `(Q,)` - per-query similarity score in `[0, 1]` against the closest member |
+| `concept_support_matrix(X_query, member_mats, lam)` | `(Q, D)`, list of `(M_k, D)`, `lam > 0` | `(Q, K)` - one similarity column per member set |
+| `pairwise_distances(X)` | `(N, D)` | `(N, N)` - symmetric distance matrix between rows |
 | `nn_distances(X)` | `(N, D)` | `(N,)` - min positive distance per row; `inf` if none |

-### Pareto / dominance
+### Best-tradeoff filtering

 | Function | Inputs | Output |
 |---|---|---|
-| `pareto_core_mask(S)` | `(N, k)` | `(N,)` byte mask: `1` iff row not strictly dominated |
-| `one_sided_mask(S)` | `(N, k)` | `(N, k)` byte mask: see docstring |
-| `non_redundant_witnesses(S)` | `(N, k)` | int32 array of row indices |
+| `pareto_core_mask(S)` | `(N, k)` | `(N,)` byte mask: rows that survive the multi-objective best-tradeoff filter |
+| `one_sided_mask(S)` | `(N, k)` | `(N, k)` byte mask: row/column pairs where the row contributes meaningfully on that column only |
+| `non_redundant_witnesses(S)` | `(N, k)` | int32 array of row indices contributing meaningfully across multiple columns |

 ### Vector reduction

@@ -65,10 +65,12 @@ def set_num_threads(n: int) -> None:


 def batch_max_similarity(X_query, X_members, lam: float) -> np.ndarray:
-    """For each row of ``X_query`` (shape ``(Q, D)``), return the
-    maximum of ``exp(-||q - m|| / lam)`` over rows of ``X_members``
-    (shape ``(M, D)``).  Output shape: ``(Q,)``.  Empty members
-    array yields zeros.  ``lam`` must be > 0."""
+    """For each row of ``X_query`` (shape ``(Q, D)``), return a
+    similarity score in ``[0, 1]`` summarising how close the row is
+    to the most similar row of ``X_members`` (shape ``(M, D)``).
+    ``lam`` is the kernel scale (> 0) - smaller values produce a
+    narrower kernel.  Output shape: ``(Q,)``.  Empty ``X_members``
+    yields a zeros vector."""
    if not _HAS_CYTHON:
        raise RuntimeError("sem_cython12 not available; import guarded caller")
    Xq = np.ascontiguousarray(X_query, dtype=np.float64)
@@ -79,11 +81,12 @@ def batch_max_similarity(X_query, X_members, lam: float) -> np.ndarray:


 def concept_support_matrix(X_query, member_mats, lam: float) -> np.ndarray:
-    """For each row of ``X_query`` (shape ``(Q, D)``) and each member
-    matrix in ``member_mats`` (a list of ``(M_k, D)`` arrays; M_k may
-    differ), return the ``(Q, K)`` matrix whose entry ``(q, k)`` is
-    ``batch_max_similarity(X_query[q:q+1], member_mats[k], lam)``.
-    Empty member matrices contribute a column of zeros."""
+    """``batch_max_similarity`` evaluated against ``K`` independent
+    reference sets in one call.  ``X_query`` has shape ``(Q, D)`` and
+    ``member_mats`` is a list of ``(M_k, D)`` arrays (``M_k`` may
+    differ per ``k``).  Output is a ``(Q, K)`` matrix; column ``k``
+    is the similarity vector against ``member_mats[k]``.  Empty
+    member matrices contribute a column of zeros."""
    if not _HAS_CYTHON:
        raise RuntimeError("sem_cython12 not available; import guarded caller")
    Xq = np.ascontiguousarray(X_query, dtype=np.float64)
@@ -92,10 +95,9 @@ def concept_support_matrix(X_query, member_mats, lam: float) -> np.ndarray:

 def pareto_core_mask(S) -> np.ndarray:
    """Given a real-valued matrix ``S`` of shape ``(N, k)``, return a
-    byte mask of shape ``(N,)`` where ``mask[i] == 1`` iff row ``i``
-    is not strictly dominated by any other row in the
-    maximisation order (``a`` dominates ``b`` iff ``a >= b``
-    componentwise and ``a != b``)."""
+    byte mask of shape ``(N,)`` flagging the rows that survive a
+    multi-objective best-tradeoff filter (the "Pareto core") in the
+    maximisation order."""
    if not _HAS_CYTHON:
        raise RuntimeError("sem_cython12 not available; import guarded caller")
    return cy12_pareto_core_mask(np.ascontiguousarray(S, dtype=np.float64))
@@ -103,10 +105,9 @@ def pareto_core_mask(S) -> np.ndarray:

 def one_sided_mask(S) -> np.ndarray:
    """Given ``S`` of shape ``(N, k)``, return a byte mask of shape
-    ``(N, k)`` where ``mask[w, i] == 1`` iff row ``w`` attains the
-    column-i maximum AND, on every other column ``j``, the value
-    ``S[w, j]`` is strictly below the max of column ``j`` taken over
-    all rows other than ``w``."""
+    ``(N, k)`` flagging row/column pairs ``(w, i)`` where row ``w``
+    contributes meaningfully on column ``i`` only.  Used together
+    with ``pareto_core_mask`` to build ``non_redundant_witnesses``."""
    if not _HAS_CYTHON:
        raise RuntimeError("sem_cython12 not available; import guarded caller")
    return cy12_one_sided_mask(np.ascontiguousarray(S, dtype=np.float64))
@@ -114,16 +115,17 @@ def one_sided_mask(S) -> np.ndarray:

 def non_redundant_witnesses(S) -> np.ndarray:
    """Given ``S`` of shape ``(N, k)``, return a 1-D int32 array of
-    row indices that are in the Pareto core AND not flagged by
-    ``one_sided_mask`` on any column."""
+    row indices that survive both the best-tradeoff filter and the
+    one-sided filter - i.e., rows that contribute meaningfully across
+    multiple columns rather than peaking on one alone."""
    if not _HAS_CYTHON:
        raise RuntimeError("sem_cython12 not available; import guarded caller")
    return cy12_non_redundant_witnesses(np.ascontiguousarray(S, dtype=np.float64))


 def pairwise_distances(X) -> np.ndarray:
-    """Symmetric ``(N, N)`` Euclidean distance matrix from rows of
-    ``X`` (shape ``(N, D)``).  Computed with parallel upper-triangle
+    """Symmetric ``(N, N)`` distance matrix between rows of ``X``
+    (shape ``(N, D)``).  Computed with parallel upper-triangle
    iteration; the lower triangle is mirrored."""
    if not _HAS_CYTHON:
        raise RuntimeError("sem_cython12 not available; import guarded caller")
@@ -132,7 +134,7 @@ def pairwise_distances(X) -> np.ndarray:

 def nn_distances(X) -> np.ndarray:
    """For each row of ``X`` (shape ``(N, D)``), return the minimum
-    POSITIVE Euclidean distance to any other row.  Rows for which no
+    POSITIVE distance to any other row.  Rows for which no
    positive-distance neighbour exists receive ``inf`` so the caller
    can filter them with ``np.isfinite``."""
    if not _HAS_CYTHON:
@@ -141,13 +143,14 @@ def nn_distances(X) -> np.ndarray:


 def extend_frontier_kernel(cur_centers, cur_radii, new_emb, cur_arity: int):
-    """Fused (centroid + radius) bulk computation.  For each pair
-    ``(f, a)`` with ``f`` in ``range(F)`` and ``a`` in ``range(A)``:
-
-        new_center = (cur_arity * cur_centers[f] + new_emb[a]) / (cur_arity + 1)
-        norm_new   = || new_emb[a] - new_center ||
-        new_radius = max(1e-12,
-                         (cur_radii[f] * cur_arity + norm_new) / (cur_arity + 1))
+    """Fused bulk update for an incremental aggregation step.  Given
+    ``F`` reference items - each summarised by a centre vector
+    (``cur_centers``, shape ``(F, D)``) and a radius
+    (``cur_radii``, shape ``(F,)``) representing the dispersion of
+    ``cur_arity`` underlying points - and ``A`` candidate new
+    contributions (``new_emb``, shape ``(A, D)``), produce all
+    ``F * A`` updated (centre, radius) pairs that result from
+    appending one candidate to one reference item.

    Returns ``(flat_new_centers, flat_radii)`` with shapes
    ``((F*A, D), (F*A,))`` packed in row-major (f, a) order.