Source code for handposeutils.embeddings.vector

# vector.py
# encoder for geometric, latent, and graph-based embeddings

import numpy as np
from math import acos
from typing import List
from handposeutils.data.handpose import HandPose
from handposeutils.data.coordinate import Coordinate
from typing import Callable, Optional, Tuple



[docs]
def get_joint_angle_vector(pose: HandPose) -> np.ndarray:
    """
    Encode a HandPose as a 15-dimensional vector of joint angles (radians).

    Fingers are visited in the order thumb, index, middle, ring, pinky, whose
    landmark chains start at indices 1, 5, 9, 13 and 17 respectively. For a
    finger whose chain starts at landmark ``k`` the three angles are, in order:

    - the angle at landmark ``k + 1`` between landmarks ``k`` and ``k + 2``
    - the angle at landmark ``k + 2`` between landmarks ``k + 1`` and ``k + 3``
    - the angle at landmark ``k`` between the wrist (landmark 0) and ``k + 1``

    Note
    ----
    The HandPose should be normalized in position and scale before embedding.

    Parameters
    ----------
    pose : HandPose
        Normalized hand pose; indexed by landmark number, each landmark
        exposing ``x``, ``y`` and ``z``.

    Returns
    -------
    np.ndarray
        Array of shape (15,) holding the angles in radians, each in [0, pi].
    """
    finger_bases = (1, 5, 9, 13, 17)
    # Keeps the cosine finite when one of the two rays has zero length.
    stability_eps = 1e-8

    def _ray(vertex, tip):
        """Vector pointing from `vertex` to `tip`."""
        return np.array([tip.x - vertex.x, tip.y - vertex.y, tip.z - vertex.z])

    def _angle_at(first, vertex, last):
        """Angle at `vertex` enclosed by the rays towards `first` and `last`."""
        ray_one = _ray(vertex, first)
        ray_two = _ray(vertex, last)
        scale = np.linalg.norm(ray_one) * np.linalg.norm(ray_two) + stability_eps
        # Clip guards acos against values a hair outside [-1, 1] from rounding.
        cosine = np.clip(np.dot(ray_one, ray_two) / scale, -1.0, 1.0)
        return acos(cosine)

    # (first, vertex, last) landmark indices, three triplets per finger.
    landmark_triplets = []
    for base in finger_bases:
        landmark_triplets.extend([
            (base, base + 1, base + 2),
            (base + 1, base + 2, base + 3),
            (0, base, base + 1),
        ])

    return np.array([
        _angle_at(pose[first], pose[vertex], pose[last])
        for first, vertex, last in landmark_triplets
    ])




[docs]
def get_bone_length_vector(pose: HandPose) -> np.ndarray:
    """
    Measure the 20 bone segments of a HandPose.

    Every finger contributes four consecutive segments along the landmark
    chain wrist (0) -> k -> k + 1 -> k + 2 -> k + 3, where ``k`` is 1, 5, 9,
    13 and 17 for thumb, index, middle, ring and pinky. Each entry is the
    Euclidean distance between the two landmarks of one segment.

    Note
    ----
    The HandPose should be normalized in position and scale before embedding
    to ensure consistent scale across samples.

    Parameters
    ----------
    pose : HandPose
        Normalized hand pose; indexed by landmark number, each landmark
        exposing ``as_tuple()``.

    Returns
    -------
    np.ndarray
        Array of shape (20,) with one length per bone segment, finger by finger.
    """
    wrist_index = 0

    # Build (start, end) landmark pairs by walking each finger's chain.
    segments = []
    for base in (1, 5, 9, 13, 17):
        chain = (wrist_index, base, base + 1, base + 2, base + 3)
        segments.extend(zip(chain[:-1], chain[1:]))

    def _segment_length(start_idx, end_idx):
        """Distance between the two landmarks of one segment."""
        start_pt = pose[start_idx]
        end_pt = pose[end_idx]
        return np.linalg.norm(np.array(end_pt.as_tuple()) - np.array(start_pt.as_tuple()))

    return np.array([_segment_length(start, end) for start, end in segments])




[docs]
def get_relative_vector_embedding(pose: HandPose) -> np.ndarray:
    """
    Express all 21 landmarks as offsets from the wrist, flattened to 63 values.

    The output lists ``(dx, dy, dz)`` for landmark 0, then landmark 1, and so
    on up to landmark 20, where each offset is the landmark's coordinate minus
    the wrist's (landmark 0). The first three entries are therefore zero.

    Note
    ----
    The HandPose should be normalized before embedding for consistent comparisons.

    Parameters
    ----------
    pose : HandPose
        Normalized hand pose; indexed by landmark number, each landmark
        exposing ``x``, ``y`` and ``z``.

    Returns
    -------
    np.ndarray
        Flattened array of shape (63,) of wrist-relative landmark positions.
    """
    origin = pose[0]

    def _offset_from_wrist(point):
        """Per-axis displacement of `point` from the wrist."""
        return (point.x - origin.x, point.y - origin.y, point.z - origin.z)

    flat_offsets = []
    for landmark_idx in range(21):
        flat_offsets.extend(_offset_from_wrist(pose[landmark_idx]))

    return np.array(flat_offsets)



[docs]
def get_fused_pose_embedding(pose: HandPose) -> np.ndarray:
    """
    Build a single fused embedding by concatenating three per-pose encodings.

    The parts appear in this order:
    - joint angles from ``get_joint_angle_vector`` (15D)
    - bone lengths from ``get_bone_length_vector`` (20D)
    - wrist-relative landmark positions from ``get_relative_vector_embedding`` (63D)

    giving 98 values in total.

    Note
    ----
    Normalize the HandPose before calling this function for consistency.

    Parameters
    ----------
    pose : HandPose
        Normalized hand pose to encode.

    Returns
    -------
    np.ndarray
        Concatenated embedding vector of shape (98,).
    """
    # Order matters: it fixes the layout of the fused vector.
    encoders = (
        get_joint_angle_vector,
        get_bone_length_vector,
        get_relative_vector_embedding,
    )
    return np.concatenate([encode(pose) for encode in encoders])


from handposeutils.data.handpose_sequence import HandPoseSequence

def _sinusoidal_time_encoding(timestamps: np.ndarray, dim: int, time_scale: float = 1.0) -> np.ndarray:
    """
    Compute continuous sinusoidal positional encodings for a sequence of timestamps,
    adapted from Transformer positional encodings.

    Column ``2i`` holds ``sin(t / 10000**(2i/dim))`` and column ``2i + 1`` holds
    ``cos(t / 10000**(2i/dim))``, where ``t = timestamp * time_scale``. Sine and
    cosine columns alternate; when 'dim' is odd the final column is a sine
    component without a matching cosine.

    Parameters
    ----------
    timestamps : np.ndarray, shape (T,)
        1D array of time values in seconds.
    dim : int
        Dimensionality of the output encoding vector (even or odd).
    time_scale : float, optional
        Multiplier applied to the timestamps before encoding (default 1.0).
        Larger values make the encoding oscillate faster along the sequence.

    Returns
    -------
    np.ndarray, shape (T, dim)
        Sinusoidal positional encoding matrix where each row corresponds to a timestamp.
    """
    timestamps = np.asarray(timestamps, dtype=np.float64)
    if timestamps.size == 0:
        return np.zeros((0, dim), dtype=float)

    t = timestamps * time_scale  # scale timestamps
    enc = np.zeros((t.shape[0], dim), dtype=float)

    # Same formula as the Transformer, pos / (10000^(2i/dim)), but evaluated at
    # continuous time t instead of an integer position.
    base = 10000.0
    # Even columns (sines) outnumber odd columns (cosines) by one when dim is
    # odd, so frequencies are generated for the sine count and the cosines
    # reuse the leading ones.
    n_sin = (dim + 1) // 2
    n_cos = dim // 2
    inv_freq = 1.0 / (base ** (2.0 * np.arange(n_sin) / float(dim)))
    angles = np.outer(t, inv_freq)  # shape (T, n_sin)

    enc[:, 0::2] = np.sin(angles)
    enc[:, 1::2] = np.cos(angles[:, :n_cos])
    return enc


def _compute_velocities(embeddings: np.ndarray, timestamps: Optional[np.ndarray] = None) -> np.ndarray:
    """
    Estimate per-frame velocities as backward differences of the embeddings.

    Row ``i`` (for ``i >= 1``) is ``embeddings[i] - embeddings[i - 1]``. When
    timestamps are given, that difference is divided by
    ``timestamps[i] - timestamps[i - 1]``; an interval of exactly zero is
    replaced by 1.0 so the division stays defined. Row 0 is always zero
    because it has no predecessor.

    Parameters
    ----------
    embeddings : np.ndarray, shape (T, D)
        Sequence of embeddings over time.
    timestamps : Optional[np.ndarray], shape (T,), optional
        Timestamp of each frame. If None, plain frame-to-frame differences
        are returned.

    Returns
    -------
    np.ndarray, shape (T, D)
        Velocity vectors, one row per input frame.
    """
    frame_count = embeddings.shape[0]
    if frame_count == 0:
        return np.zeros_like(embeddings)

    # Prepend a zero row so the output keeps one row per frame.
    leading_zero_row = np.zeros((1, embeddings.shape[1]))
    deltas = np.concatenate([leading_zero_row, np.diff(embeddings, axis=0)], axis=0)
    if timestamps is None:
        return deltas

    # prepend=timestamps[0] makes the first interval 0, which is then mapped
    # to 1.0 together with any repeated timestamps.
    intervals = np.diff(timestamps, prepend=timestamps[0])
    intervals[intervals == 0] = 1.0
    return deltas / intervals[:, None]


def _uniform_downsample(array: np.ndarray, target_len: int) -> np.ndarray:
    """
    Pick `target_len` evenly spaced rows from an array.

    Row indices are taken from an integer-typed ``np.linspace`` spanning the
    first to the last row, so both endpoints are kept. An input that already
    has `target_len` rows or fewer is returned as a copy with no rows removed.

    Parameters
    ----------
    array : np.ndarray, shape (N, D)
        Input array to downsample along the first dimension.
    target_len : int
        Desired number of rows in the output.

    Returns
    -------
    np.ndarray, shape (target_len, D) or (N, D)
        The selected rows, or a copy of the input when N <= target_len.
    """
    row_count = array.shape[0]
    if row_count > target_len:
        picks = np.linspace(0, row_count - 1, num=target_len, dtype=int)
        return array[picks]
    return array.copy()


def _pca_reduce(matrix: np.ndarray, n_components: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Helper to perform Principal Component Analysis (PCA) via SVD
    to reduce dimensionality of a data matrix.

    The data is mean-centered, decomposed with a thin SVD, and projected onto
    the leading right-singular vectors. A thin SVD yields at most
    ``min(N, D)`` components; if more are requested, the missing components
    are filled with zero vectors (and the matching projected columns are
    zero) so the output always has exactly `n_components` columns.

    Parameters
    ----------
    matrix : np.ndarray, shape (N, D)
        Input data matrix with N samples and D features.
    n_components : int
        Number of principal components to retain. Must be non-negative.

    Returns
    -------
    reduced : np.ndarray, shape (N, n_components)
        Centered data projected onto the retained components.
    components : np.ndarray, shape (n_components, D)
        Principal component vectors (zero rows beyond the available ones).
    mean : np.ndarray, shape (D,)
        Mean of the original data, used for centering.

    Raises
    ------
    ValueError
        If `n_components` is negative.

    Notes
    -----
    For an empty matrix (``matrix.size == 0``) there is nothing to fit: a copy
    of the input, an empty ``(0, D)`` component array and a zero mean are
    returned.
    """
    if n_components < 0:
        # A negative count would otherwise be read as a slice "all but the last k".
        raise ValueError(f"n_components must be non-negative, got {n_components}")
    if matrix.size == 0:
        return matrix.copy(), np.zeros((0, matrix.shape[1])), np.zeros((matrix.shape[1],))

    # center
    mean = matrix.mean(axis=0)
    centered = matrix - mean

    # Rows of vt are the principal directions, ordered by decreasing singular value.
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    components = vt[:n_components]  # (<= n_components, D)

    # Pad with zero vectors when fewer directions exist than were requested,
    # so callers get the documented fixed width.
    missing = n_components - components.shape[0]
    if missing > 0:
        components = np.vstack([components, np.zeros((missing, components.shape[1]))])

    reduced = centered @ components.T  # (N, n_components)
    return reduced, components, mean



[docs]
def structured_temporal_embedding(
    sequence: HandPoseSequence,
    pose_embedding_fn: Callable[[object], np.ndarray],
    max_length: Optional[int] = None,
    include_velocity: bool = True,
    time_scale: float = 1.0,
    downsample: Optional[str] = "uniform",  # or None
    pca_components: Optional[int] = None,
    verbose: bool = False
) -> np.ndarray:
    """
    Construct a structured temporal embedding for a HandPoseSequence.

    Each frame is embedded by `pose_embedding_fn`; a sinusoidal encoding of the
    frame's timestamp (same width as the pose embedding) is added element-wise,
    and first-order velocities of the raw pose embeddings are optionally
    appended as extra columns. Sequences longer than `max_length` are
    downsampled or truncated, PCA is optionally applied across the frames, and
    shorter sequences are zero-padded up to `max_length`.

    Parameters
    ----------
    sequence : HandPoseSequence
        Sequence of timed hand poses.
    pose_embedding_fn : Callable[[HandPose], np.ndarray]
        Function to compute static embedding for each HandPose frame.
    max_length : Optional[int], optional
        Target sequence length for output embeddings. Longer sequences are
        downsampled (``downsample='uniform'``) or truncated to their first
        `max_length` frames (any other `downsample` value).
        Shorter sequences are zero-padded.
    include_velocity : bool, optional
        Whether to append first-order velocity embeddings (default True).
    time_scale : float, optional
        Scale factor for sinusoidal time encoding frequencies (default 1.0).
    downsample : Optional[str], optional
        How to shorten a sequence that exceeds max_length.
        'uniform' uniformly samples frames; None keeps only the leading
        frames (default 'uniform').
    pca_components : Optional[int], optional
        If set to a positive number, reduces per-frame embedding dimension to
        this number using PCA fitted on the frames of this sequence
        (default None). If fewer components are available than requested, the
        remaining columns are zero.
    verbose : bool, optional
        If True, prints debug information (default False).

    Returns
    -------
    np.ndarray, shape (T_out, D_out)
        Temporal embedding matrix with T_out = max_length (if specified) or
        sequence length. D_out is `pca_components` when PCA is requested,
        otherwise the pose embedding width, doubled when `include_velocity`
        is True. An empty sequence yields an all-zero matrix of that shape
        (``(0, 0)`` when `max_length` is None; width 0 when the pose width
        cannot be inferred).

    Raises
    ------
    ValueError
        If `pose_embedding_fn` returns None for a frame.
    """
    use_pca = pca_components is not None and pca_components > 0

    # 1) Validate and extract
    seq_len = len(sequence)
    if seq_len == 0:
        if verbose:
            print("[structured_temporal_embedding] empty sequence -> returning zeros")
        if max_length is None:
            return np.zeros((0, 0), dtype=float)
        if use_pca:
            # Width is fixed by the PCA target regardless of the pose embedding.
            return np.zeros((max_length, pca_components), dtype=float)
        # No frames to measure, so probe the embedding width on the current pose if there is one.
        dummy = pose_embedding_fn(sequence.current_pose) if sequence.current_pose is not None else None
        if dummy is None:
            # cannot infer dimension; return zeros with shape (max_length, 0)
            return np.zeros((max_length, 0), dtype=float)
        # atleast_2d mirrors how np.vstack lays out per-frame embeddings below.
        pose_dim = np.atleast_2d(np.asarray(dummy, dtype=float)).shape[1]
        # Positional encoding is summed into the pose columns, so only the
        # velocity block adds width.
        per_frame_dim = pose_dim * (2 if include_velocity else 1)
        return np.zeros((max_length, per_frame_dim), dtype=float)

    timestamps = np.array(sequence.get_all_timestamps(), dtype=float)  # (T,)

    # 2) Compute per-frame pose embeddings
    per_frame = []
    for timed in sequence.sequence:
        e = pose_embedding_fn(timed.pose)
        if e is None:
            raise ValueError("pose_embedding_fn returned None for a pose")
        per_frame.append(np.asarray(e, dtype=float))
    per_frame = np.vstack(per_frame)  # (T, D_pose)
    if verbose:
        print(f"[structured] raw per-frame embeddings shape: {per_frame.shape}")

    # 3) Shorten to max_length: uniform downsample, otherwise keep leading frames
    T, D_pose = per_frame.shape
    if max_length is not None and T > max_length:
        if downsample == "uniform":
            per_frame = _uniform_downsample(per_frame, max_length)
            # Same index rule as _uniform_downsample keeps timestamps aligned with frames.
            timestamps = timestamps[np.linspace(0, T - 1, num=max_length, dtype=int)]
            T = max_length
            if verbose:
                print(f"[structured] downsampled to {T} frames")
        else:
            per_frame = per_frame[:max_length]
            timestamps = timestamps[:max_length]
            T = max_length
            if verbose:
                print(f"[structured] truncated to {T} frames")

    # 4) Compute positional encodings (same dimensionality as pose embedding)
    pos_enc = _sinusoidal_time_encoding(timestamps, D_pose, time_scale=time_scale)  # (T, D_pose)

    # 5) Compose embedding per frame: pose + posenc
    composed = per_frame + pos_enc  # (T, D_pose)

    # 6) Optionally compute velocities and append
    if include_velocity:
        velocities = _compute_velocities(per_frame, timestamps)  # (T, D_pose)
        composed = np.hstack([composed, velocities])  # (T, 2*D_pose)
        if verbose:
            print(f"[structured] velocities appended; per-frame dim now {composed.shape[1]}")

    # 7) Optionally PCA reduce per-frame dim (fitted before zero-padding rows)
    if use_pca:
        if composed.shape[0] > 0:
            reduced, _, _ = _pca_reduce(composed, pca_components)
        else:
            reduced = np.zeros((0, pca_components), dtype=float)
        # Short sequences can yield fewer components than requested; pad the
        # columns so the per-frame width is always pca_components.
        missing_cols = pca_components - reduced.shape[1]
        if missing_cols > 0:
            reduced = np.hstack([reduced, np.zeros((reduced.shape[0], missing_cols))])
        composed = reduced  # (T, pca_components)
        if verbose:
            print(f"[structured] PCA reduced per-frame dim to {pca_components}")

    # 8) Padding if seq shorter than max_length
    if max_length is not None and T < max_length:
        pad_count = max_length - T
        pad = np.zeros((pad_count, composed.shape[1]))
        composed = np.vstack([composed, pad])
        if verbose:
            print(f"[structured] padded from {T} to {max_length} frames")

    return composed  # shape (T_out, D_out)




[docs]
def flatten_temporal_embedding(
    sequence: HandPoseSequence,
    pose_embedding_fn: Callable[[object], np.ndarray],
    max_length: Optional[int] = 30,
    include_velocity: bool = True,
    time_scale: float = 1.0,
    downsample: Optional[str] = "uniform",
    pca_components: Optional[int] = None,
    verbose: bool = False
) -> np.ndarray:
    """
    Build the structured temporal embedding of a sequence and flatten it to 1D.

    All arguments are forwarded unchanged to ``structured_temporal_embedding``;
    the resulting (frames x features) matrix is then laid out row by row, so
    the values of frame 0 come first, followed by frame 1, and so on.

    Parameters
    ----------
    sequence : HandPoseSequence
        Sequence of timed hand poses.
    pose_embedding_fn : Callable[[HandPose], np.ndarray]
        Function to compute static embedding for each HandPose frame.
    max_length : Optional[int], optional
        Target sequence length for truncation/padding (default 30).
    include_velocity : bool, optional
        Whether to append velocity embeddings (default True).
    time_scale : float, optional
        Frequency scale for positional encoding (default 1.0).
    downsample : Optional[str], optional
        Downsampling method for longer sequences (default 'uniform').
    pca_components : Optional[int], optional
        Dimensionality for PCA reduction (default None).
    verbose : bool, optional
        If True, prints debug information (default False).

    Returns
    -------
    np.ndarray, shape (T_out * D_out,)
        Flattened temporal embedding vector, where (T_out, D_out) is the shape
        returned by ``structured_temporal_embedding`` for the same arguments.
    """
    forwarded_options = dict(
        max_length=max_length,
        include_velocity=include_velocity,
        time_scale=time_scale,
        downsample=downsample,
        pca_components=pca_components,
        verbose=verbose,
    )
    frame_matrix = structured_temporal_embedding(
        sequence=sequence,
        pose_embedding_fn=pose_embedding_fn,
        **forwarded_options,
    )
    # Row-major copy: one frame after another.
    return frame_matrix.flatten()