diff --git a/README.md b/README.md index 649b7cc..b2daddb 100644 --- a/README.md +++ b/README.md @@ -75,18 +75,18 @@ internally cast to contiguous `float64`. Outputs are numpy arrays. | Function | Inputs | Output | |---|---|---| -| `batch_max_similarity(X_query, X_members, lam)` | `(Q, D)`, `(M, D)`, `lam > 0` | `(Q,)` - per-query max of `exp(-d / lam)` | -| `concept_support_matrix(X_query, member_mats, lam)` | `(Q, D)`, list of `(M_k, D)`, `lam > 0` | `(Q, K)` - one column per member matrix | -| `pairwise_distances(X)` | `(N, D)` | `(N, N)` - symmetric Euclidean matrix | +| `batch_max_similarity(X_query, X_members, lam)` | `(Q, D)`, `(M, D)`, `lam > 0` | `(Q,)` - per-query similarity score in `[0, 1]` against the closest member | +| `concept_support_matrix(X_query, member_mats, lam)` | `(Q, D)`, list of `(M_k, D)`, `lam > 0` | `(Q, K)` - one similarity column per member set | +| `pairwise_distances(X)` | `(N, D)` | `(N, N)` - symmetric distance matrix between rows | | `nn_distances(X)` | `(N, D)` | `(N,)` - min positive distance per row; `inf` if none | -### Pareto / dominance +### Best-tradeoff filtering | Function | Inputs | Output | |---|---|---| -| `pareto_core_mask(S)` | `(N, k)` | `(N,)` byte mask: `1` iff row not strictly dominated | -| `one_sided_mask(S)` | `(N, k)` | `(N, k)` byte mask: see docstring | -| `non_redundant_witnesses(S)` | `(N, k)` | int32 array of row indices | +| `pareto_core_mask(S)` | `(N, k)` | `(N,)` byte mask: rows that survive the multi-objective best-tradeoff filter | +| `one_sided_mask(S)` | `(N, k)` | `(N, k)` byte mask: rows contributing meaningfully on a single column only | +| `non_redundant_witnesses(S)` | `(N, k)` | int32 array of row indices contributing meaningfully across multiple columns | ### Vector reduction diff --git a/sem_cython12/wrapper.py b/sem_cython12/wrapper.py index b7b4f90..da8821d 100644 --- a/sem_cython12/wrapper.py +++ b/sem_cython12/wrapper.py @@ -65,10 +65,12 @@ def set_num_threads(n: int) -> None: def 
batch_max_similarity(X_query, X_members, lam: float) -> np.ndarray: - """For each row of ``X_query`` (shape ``(Q, D)``), return the - maximum of ``exp(-||q - m|| / lam)`` over rows of ``X_members`` - (shape ``(M, D)``). Output shape: ``(Q,)``. Empty members - array yields zeros. ``lam`` must be > 0.""" + """For each row of ``X_query`` (shape ``(Q, D)``), return a + similarity score in ``[0, 1]`` summarising how close the row is + to the most similar row of ``X_members`` (shape ``(M, D)``). + ``lam`` is the kernel scale (> 0) - smaller values produce a + narrower kernel. Output shape: ``(Q,)``. Empty ``X_members`` + yields a zeros vector.""" if not _HAS_CYTHON: raise RuntimeError("sem_cython12 not available; import guarded caller") Xq = np.ascontiguousarray(X_query, dtype=np.float64) @@ -79,11 +81,12 @@ def batch_max_similarity(X_query, X_members, lam: float) -> np.ndarray: def concept_support_matrix(X_query, member_mats, lam: float) -> np.ndarray: - """For each row of ``X_query`` (shape ``(Q, D)``) and each member - matrix in ``member_mats`` (a list of ``(M_k, D)`` arrays; M_k may - differ), return the ``(Q, K)`` matrix whose entry ``(q, k)`` is - ``batch_max_similarity(X_query[q:q+1], member_mats[k], lam)``. - Empty member matrices contribute a column of zeros.""" + """``batch_max_similarity`` evaluated against ``K`` independent + reference sets in one call. ``X_query`` has shape ``(Q, D)`` and + ``member_mats`` is a list of ``(M_k, D)`` arrays (``M_k`` may + differ per ``k``). Output is a ``(Q, K)`` matrix; column ``k`` + is the similarity vector against ``member_mats[k]``. 
Empty + member matrices contribute a column of zeros.""" if not _HAS_CYTHON: raise RuntimeError("sem_cython12 not available; import guarded caller") Xq = np.ascontiguousarray(X_query, dtype=np.float64) @@ -92,10 +95,9 @@ def concept_support_matrix(X_query, member_mats, lam: float) -> np.ndarray: def pareto_core_mask(S) -> np.ndarray: """Given a real-valued matrix ``S`` of shape ``(N, k)``, return a - byte mask of shape ``(N,)`` where ``mask[i] == 1`` iff row ``i`` - is not strictly dominated by any other row in the - maximisation order (``a`` dominates ``b`` iff ``a >= b`` - componentwise and ``a != b``).""" + byte mask of shape ``(N,)`` flagging the rows that survive a + multi-objective best-tradeoff filter (the "Pareto core": rows not + strictly dominated by any other row) in the maximisation order.""" if not _HAS_CYTHON: raise RuntimeError("sem_cython12 not available; import guarded caller") return cy12_pareto_core_mask(np.ascontiguousarray(S, dtype=np.float64)) @@ -103,10 +105,9 @@ def pareto_core_mask(S) -> np.ndarray: def one_sided_mask(S) -> np.ndarray: """Given ``S`` of shape ``(N, k)``, return a byte mask of shape - ``(N, k)`` where ``mask[w, i] == 1`` iff row ``w`` attains the - column-i maximum AND, on every other column ``j``, the value - ``S[w, j]`` is strictly below the max of column ``j`` taken over - all rows other than ``w``.""" + ``(N, k)`` flagging row/column pairs ``(w, i)`` where row ``w`` + contributes meaningfully on column ``i`` only. 
Used together + with ``pareto_core_mask`` to build ``non_redundant_witnesses``.""" if not _HAS_CYTHON: raise RuntimeError("sem_cython12 not available; import guarded caller") return cy12_one_sided_mask(np.ascontiguousarray(S, dtype=np.float64)) @@ -114,16 +115,17 @@ def one_sided_mask(S) -> np.ndarray: def non_redundant_witnesses(S) -> np.ndarray: """Given ``S`` of shape ``(N, k)``, return a 1-D int32 array of - row indices that are in the Pareto core AND not flagged by - ``one_sided_mask`` on any column.""" + row indices that survive both the best-tradeoff filter and the + one-sided filter - i.e., rows that contribute meaningfully across + multiple columns rather than peaking on one alone.""" if not _HAS_CYTHON: raise RuntimeError("sem_cython12 not available; import guarded caller") return cy12_non_redundant_witnesses(np.ascontiguousarray(S, dtype=np.float64)) def pairwise_distances(X) -> np.ndarray: - """Symmetric ``(N, N)`` Euclidean distance matrix from rows of - ``X`` (shape ``(N, D)``). Computed with parallel upper-triangle + """Symmetric ``(N, N)`` distance matrix between rows of ``X`` + (shape ``(N, D)``). Computed with parallel upper-triangle iteration; the lower triangle is mirrored.""" if not _HAS_CYTHON: raise RuntimeError("sem_cython12 not available; import guarded caller") @@ -132,7 +134,7 @@ def pairwise_distances(X) -> np.ndarray: def nn_distances(X) -> np.ndarray: """For each row of ``X`` (shape ``(N, D)``), return the minimum - POSITIVE Euclidean distance to any other row. Rows for which no + POSITIVE distance to any other row. Rows for which no positive-distance neighbour exists receive ``inf`` so the caller can filter them with ``np.isfinite``.""" if not _HAS_CYTHON: @@ -141,13 +143,14 @@ def nn_distances(X) -> np.ndarray: def extend_frontier_kernel(cur_centers, cur_radii, new_emb, cur_arity: int): - """Fused (centroid + radius) bulk computation. 
For each pair - ``(f, a)`` with ``f`` in ``range(F)`` and ``a`` in ``range(A)``: - - new_center = (cur_arity * cur_centers[f] + new_emb[a]) / (cur_arity + 1) - norm_new = || new_emb[a] - new_center || - new_radius = max(1e-12, - (cur_radii[f] * cur_arity + norm_new) / (cur_arity + 1)) + """Fused bulk update for an incremental aggregation step. Given + ``F`` reference items - each summarised by a centre vector + (``cur_centers``, shape ``(F, D)``) and a radius + (``cur_radii``, shape ``(F,)``) representing the dispersion of + ``cur_arity`` underlying points - and ``A`` candidate new + contributions (``new_emb``, shape ``(A, D)``), produce all + ``F * A`` updated (centre, radius) pairs that result from + appending one candidate to one reference item. Returns ``(flat_new_centers, flat_radii)`` with shapes ``((F*A, D), (F*A,))`` packed in row-major (f, a) order.