# Source code for sample_module

"""
Sample module for demonstrating scientific Python documentation.

This module provides functions for data analysis and statistical computations
commonly used in scientific research.
"""

import numpy as np
from typing import Union, Tuple, Optional



# [docs]
def calculate_mean_std(data: np.ndarray) -> Tuple[float, float]:
    """
    Return the arithmetic mean and population standard deviation of ``data``.

    Both statistics are delegated to NumPy and computed over every element
    of the input.

    Parameters
    ----------
    data : np.ndarray
        Numerical data, expected to be a 1D array.

    Returns
    -------
    mean : float
        Arithmetic mean of ``data``.
    std : float
        Population standard deviation of ``data`` (divisor ``N``).

    Examples
    --------
    >>> import numpy as np
    >>> values = np.array([1, 2, 3, 4, 5])
    >>> mean, std = calculate_mean_std(values)
    >>> print(f"Mean: {mean}, Std: {std:.2f}")
    Mean: 3.0, Std: 1.41

    Notes
    -----
    The spread uses zero delta degrees of freedom, i.e. the population
    formula. Use ``np.std(data, ddof=1)`` directly if the sample estimate
    (divisor ``N - 1``) is required.

    See Also
    --------
    numpy.mean : Compute the arithmetic mean.
    numpy.std : Compute the standard deviation.
    """
    # ddof=0 is NumPy's default; spelled out so the population divisor is explicit.
    return np.mean(data), np.std(data, ddof=0)




# [docs]
def linear_regression(x: np.ndarray, y: np.ndarray) -> Tuple[float, float, float]:
    """
    Perform simple linear regression on two variables.

    Fits a linear model y = mx + b to the data using the least squares method.
    Returns the slope, intercept, and coefficient of determination (R²).

    Parameters
    ----------
    x : np.ndarray
        Independent variable data (1D array or array-like).
    y : np.ndarray
        Dependent variable data (1D array or array-like).

    Returns
    -------
    slope : float
        The slope (m) of the fitted line.
    intercept : float
        The y-intercept (b) of the fitted line.
    r_squared : float
        The coefficient of determination (R²), indicating goodness of fit.
        Values range from 0 to 1, where 1 indicates perfect fit. If ``y`` is
        constant (zero total sum of squares), 0 is returned.

    Raises
    ------
    ValueError
        If x and y have different lengths, if they are empty, or if x has
        zero variance (all values identical).

    Examples
    --------
    >>> import numpy as np
    >>> x = np.array([1, 2, 3, 4, 5])
    >>> y = np.array([2, 4, 5, 4, 5])
    >>> slope, intercept, r2 = linear_regression(x, y)
    >>> print(f"y = {slope:.2f}x + {intercept:.2f}, R² = {r2:.3f}")
    y = 0.60x + 2.20, R² = 0.600

    Notes
    -----
    The slope is the ratio of the centered cross-product sum to the centered
    sum of squares of ``x``:

    .. math::
        m = \\frac{\\sum (x_i - \\bar{x})(y_i - \\bar{y})}{\\sum (x_i - \\bar{x})^2}

    Both sums use the same normalization, so no N versus N-1 divisor
    mismatch can bias the estimate.

    The R² value is calculated as:

    .. math::
        R^2 = 1 - \\frac{SS_{res}}{SS_{tot}}

    where :math:`SS_{res}` is the residual sum of squares and
    :math:`SS_{tot}` is the total sum of squares.

    References
    ----------
    .. [1] Draper, N. R., & Smith, H. (1998). Applied Regression Analysis
           (3rd ed.). Wiley-Interscience.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    # Accept plain sequences as well as arrays; arithmetic below needs ndarrays.
    x = np.asarray(x)
    y = np.asarray(y)

    if x.size == 0:
        raise ValueError("x and y must not be empty")

    # Exact constant check: comparing np.std(x) to 0 can miss constant float
    # input because rounding in the mean leaves a tiny non-zero deviation.
    if np.max(x) == np.min(x):
        raise ValueError("x must have non-zero variance")

    x_mean = np.mean(x)
    y_mean = np.mean(y)
    x_dev = x - x_mean
    y_dev = y - y_mean

    # Least-squares slope from un-normalized centered sums (same divisor for
    # numerator and denominator, so it cancels).
    slope = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    intercept = y_mean - slope * x_mean

    # Coefficient of determination.
    residuals = y - (slope * x + intercept)
    ss_res = np.sum(residuals ** 2)
    ss_tot = np.sum(y_dev ** 2)
    r_squared = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0

    return float(slope), float(intercept), float(r_squared)




# [docs]
def normalize_data(data: np.ndarray, method: str = 'zscore') -> np.ndarray:
    """
    Rescale ``data`` with the chosen normalization method.

    The statistics used for rescaling (mean/std or min/max) are computed
    over all elements of the input, not per axis.

    Parameters
    ----------
    data : np.ndarray
        Input data to be normalized (1D or 2D array).
    method : {'zscore', 'minmax'}, optional
        Normalization method to use (default is 'zscore').

        * 'zscore': subtract the mean and divide by the population
          standard deviation
        * 'minmax': map the minimum to 0 and the maximum to 1

    Returns
    -------
    normalized : np.ndarray
        Rescaled data with the same shape as the input. When the input is
        constant (zero standard deviation, or minimum equal to maximum),
        the result is ``np.zeros_like(data)``, which keeps the input dtype.

    Raises
    ------
    ValueError
        If method is not 'zscore' or 'minmax'.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([1, 2, 3, 4, 5])
    >>> normalized = normalize_data(data, method='zscore')
    >>> print(f"Mean: {np.mean(normalized):.2f}, Std: {np.std(normalized):.2f}")
    Mean: 0.00, Std: 1.00

    >>> normalized = normalize_data(data, method='minmax')
    >>> print(f"Min: {np.min(normalized):.2f}, Max: {np.max(normalized):.2f}")
    Min: 0.00, Max: 1.00

    Notes
    -----
    Z-score normalization is defined as:

    .. math::
        z = \\frac{x - \\mu}{\\sigma}

    Min-max normalization is defined as:

    .. math::
        x_{norm} = \\frac{x - x_{min}}{x_{max} - x_{min}}
    """
    if method == 'minmax':
        lowest = np.min(data)
        highest = np.max(data)
        # A flat input has no range to scale by; avoid dividing by zero.
        if highest == lowest:
            return np.zeros_like(data)
        span = highest - lowest
        return (data - lowest) / span

    if method == 'zscore':
        center = np.mean(data)
        spread = np.std(data)
        # A flat input has no spread to scale by; avoid dividing by zero.
        if spread == 0:
            return np.zeros_like(data)
        return (data - center) / spread

    raise ValueError(f"Unknown method '{method}'. Use 'zscore' or 'minmax'.")




# [docs]
class DataAnalyzer:
    """
    A class for performing common data analysis operations.

    This class provides summary statistics and outlier detection for a
    numerical dataset.

    Parameters
    ----------
    data : np.ndarray
        Input dataset to analyze. Array-likes (e.g. lists) are converted
        with ``np.asarray``; an existing ndarray is stored without copying.
    name : str, optional
        Name identifier for the dataset (default is 'dataset').

    Attributes
    ----------
    data : np.ndarray
        The stored dataset.
    name : str
        The dataset name.
    n_samples : int
        Number of samples in the dataset (``len(data)``).

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    >>> analyzer = DataAnalyzer(data, name='test_data')
    >>> summary = analyzer.get_summary()
    >>> print(summary['mean'])
    5.5

    Notes
    -----
    This class is designed for 1D numerical datasets. For multidimensional
    data, consider reshaping or using specialized tools.
    """

    def __init__(self, data: np.ndarray, name: str = 'dataset'):
        """Initialize the DataAnalyzer with a dataset and an optional name."""
        # Coerce so the element-wise comparisons in detect_outliers also work
        # when the caller passes a plain list or tuple.
        self.data = np.asarray(data)
        self.name = name
        self.n_samples = len(self.data)

    def get_summary(self) -> dict:
        """
        Get statistical summary of the dataset.

        Returns
        -------
        dict
            Dictionary containing statistical measures:

            * 'mean': arithmetic mean
            * 'median': median value
            * 'std': population standard deviation (NumPy default, divisor N)
            * 'min': minimum value
            * 'max': maximum value
            * 'q25': 25th percentile
            * 'q75': 75th percentile

        Examples
        --------
        >>> import numpy as np
        >>> analyzer = DataAnalyzer(np.array([1, 2, 3, 4, 5]))
        >>> summary = analyzer.get_summary()
        >>> print(summary['median'])
        3.0
        """
        q25, q75 = np.percentile(self.data, [25, 75])
        return {
            'mean': np.mean(self.data),
            'median': np.median(self.data),
            'std': np.std(self.data),
            'min': np.min(self.data),
            'max': np.max(self.data),
            'q25': q25,
            'q75': q75,
        }

    def detect_outliers(self, method: str = 'iqr', threshold: float = 1.5) -> np.ndarray:
        """
        Detect outliers in the dataset.

        Parameters
        ----------
        method : {'iqr', 'zscore'}, optional
            Method for outlier detection (default is 'iqr').

            * 'iqr': Interquartile range method
            * 'zscore': Z-score method

        threshold : float, optional
            Threshold for outlier detection (default is 1.5 for IQR,
            typically 3.0 for z-score). The default of 1.5 applies to both
            methods, so pass an explicit value when using 'zscore'.

        Returns
        -------
        np.ndarray
            Boolean array indicating outliers (True) and inliers (False).
            With the 'zscore' method, a dataset with zero standard
            deviation has no outliers and an all-False mask is returned.

        Raises
        ------
        ValueError
            If method is not 'iqr' or 'zscore'.

        Examples
        --------
        >>> import numpy as np
        >>> data = np.array([1, 2, 3, 4, 5, 100])  # 100 is an outlier
        >>> analyzer = DataAnalyzer(data)
        >>> outliers = analyzer.detect_outliers(method='iqr')
        >>> print(data[outliers])
        [100]

        Notes
        -----
        The IQR method considers values outside of
        :math:`[Q1 - threshold \\times IQR, Q3 + threshold \\times IQR]`
        as outliers, where :math:`IQR = Q3 - Q1`.
        """
        if method == 'iqr':
            q1, q3 = np.percentile(self.data, [25, 75])
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            return (self.data < lower) | (self.data > upper)
        elif method == 'zscore':
            spread = np.std(self.data)
            # Constant data: every z-score would be 0/0. Nothing deviates from
            # the mean, so report no outliers instead of dividing by zero.
            if spread == 0:
                return np.zeros(np.shape(self.data), dtype=bool)
            z_scores = np.abs((self.data - np.mean(self.data)) / spread)
            return z_scores > threshold
        else:
            raise ValueError(f"Unknown method '{method}'. Use 'iqr' or 'zscore'.")