Source code for sample_module

"""
Sample module for demonstrating scientific Python documentation.

This module provides functions for data analysis and statistical computations
commonly used in scientific research.
"""

import numpy as np
from typing import Union, Tuple, Optional


def calculate_mean_std(data: np.ndarray) -> Tuple[float, float]:
    """
    Return the arithmetic mean and population standard deviation of a dataset.

    Both statistics are delegated to NumPy (``np.mean`` and ``np.std``) and
    are computed over all elements of the input.

    Parameters
    ----------
    data : np.ndarray
        A 1D numpy array containing numerical data.

    Returns
    -------
    mean : float
        The arithmetic mean of the dataset.
    std : float
        The population standard deviation of the dataset.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([1, 2, 3, 4, 5])
    >>> mean, std = calculate_mean_std(data)
    >>> print(f"Mean: {mean}, Std: {std:.2f}")
    Mean: 3.0, Std: 1.41

    Notes
    -----
    ``np.std`` is called with its default settings, i.e. the population
    formula (N divisor) rather than the sample formula (N-1 divisor).

    See Also
    --------
    numpy.mean : Compute the arithmetic mean.
    numpy.std : Compute the standard deviation.
    """
    return np.mean(data), np.std(data)
def linear_regression(x: np.ndarray, y: np.ndarray) -> Tuple[float, float, float]:
    """
    Perform simple linear regression on two variables.

    Fits a linear model y = mx + b to the data using the least squares
    method. Returns the slope, intercept, and coefficient of determination
    (R²).

    Parameters
    ----------
    x : np.ndarray
        Independent variable data (1D array).
    y : np.ndarray
        Dependent variable data (1D array).

    Returns
    -------
    slope : float
        The slope (m) of the fitted line.
    intercept : float
        The y-intercept (b) of the fitted line.
    r_squared : float
        The coefficient of determination (R²), indicating goodness of fit.
        Values range from 0 to 1, where 1 indicates perfect fit. When all
        ``y`` values are identical (total sum of squares is zero), 0.0 is
        returned.

    Raises
    ------
    ValueError
        If x and y have different lengths, are empty, or if x has zero
        variance.

    Examples
    --------
    >>> import numpy as np
    >>> x = np.array([1, 2, 3, 4, 5])
    >>> y = np.array([2, 4, 5, 4, 5])
    >>> slope, intercept, r2 = linear_regression(x, y)
    >>> print(f"y = {slope:.2f}x + {intercept:.2f}, R² = {r2:.3f}")
    y = 0.60x + 2.20, R² = 0.600

    Notes
    -----
    The slope is the ratio of the centered cross-product sum to the centered
    sum of squares of ``x``:

    .. math:: m = \\frac{\\sum (x_i - \\bar{x})(y_i - \\bar{y})}{\\sum (x_i - \\bar{x})^2}

    The R² value is calculated as:

    .. math:: R^2 = 1 - \\frac{SS_{res}}{SS_{tot}}

    where :math:`SS_{res}` is the residual sum of squares and
    :math:`SS_{tot}` is the total sum of squares.

    References
    ----------
    .. [1] Draper, N. R., & Smith, H. (1998). Applied Regression Analysis
       (3rd ed.). Wiley-Interscience.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    if len(x) == 0:
        raise ValueError("x and y must not be empty")
    if np.std(x) == 0:
        raise ValueError("x must have non-zero variance")

    # Use the same (unnormalized) sums for numerator and denominator.
    # Mixing np.cov (N-1 divisor by default) with np.var (N divisor by
    # default) would scale the slope by N / (N - 1).
    x_centered = x - np.mean(x)
    y_centered = y - np.mean(y)
    slope = np.sum(x_centered * y_centered) / np.sum(x_centered ** 2)
    intercept = np.mean(y) - slope * np.mean(x)

    # Calculate R²
    y_pred = slope * x + intercept
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum(y_centered ** 2)
    # R² is undefined for constant y; report 0.0 in that case.
    r_squared = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0.0

    return slope, intercept, r_squared
def normalize_data(data: np.ndarray, method: str = 'zscore') -> np.ndarray:
    """
    Normalize data using specified method.

    Transforms the input data to a standardized scale for improved
    comparability and analysis. Statistics (mean, std, min, max) are taken
    over all elements of the input, not per axis.

    Parameters
    ----------
    data : np.ndarray
        Input data to be normalized (1D or 2D array).
    method : {'zscore', 'minmax'}, optional
        Normalization method to use (default is 'zscore').

        * 'zscore': Standardize to zero mean and unit variance
        * 'minmax': Scale to [0, 1] range

    Returns
    -------
    normalized : np.ndarray
        Normalized floating-point data with the same shape as input. If the
        data has no spread (zero standard deviation for 'zscore', or equal
        minimum and maximum for 'minmax'), an array of zeros is returned.

    Raises
    ------
    ValueError
        If method is not 'zscore' or 'minmax'.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([1, 2, 3, 4, 5])
    >>> normalized = normalize_data(data, method='zscore')
    >>> print(f"Mean: {np.mean(normalized):.2f}, Std: {np.std(normalized):.2f}")
    Mean: 0.00, Std: 1.00

    >>> normalized = normalize_data(data, method='minmax')
    >>> print(f"Min: {np.min(normalized):.2f}, Max: {np.max(normalized):.2f}")
    Min: 0.00, Max: 1.00

    Notes
    -----
    Z-score normalization is defined as:

    .. math:: z = \\frac{x - \\mu}{\\sigma}

    Min-max normalization is defined as:

    .. math:: x_{norm} = \\frac{x - x_{min}}{x_{max} - x_{min}}
    """
    # Reject an unknown method before doing any numerical work.
    if method not in ('zscore', 'minmax'):
        raise ValueError(f"Unknown method '{method}'. Use 'zscore' or 'minmax'.")

    values = np.asarray(data)
    # Promote boolean/integer input to float so the degenerate branches
    # (np.zeros_like) return the same floating dtype as the regular path
    # instead of integer zeros.
    if values.dtype.kind in 'biu':
        values = values.astype(float)

    if method == 'zscore':
        mean = np.mean(values)
        std = np.std(values)
        if std == 0:
            return np.zeros_like(values)
        return (values - mean) / std

    min_val = np.min(values)
    max_val = np.max(values)
    if max_val == min_val:
        return np.zeros_like(values)
    return (values - min_val) / (max_val - min_val)
class DataAnalyzer:
    """
    A class for performing common data analysis operations.

    This class provides methods for statistical analysis and outlier
    detection on scientific datasets.

    Parameters
    ----------
    data : np.ndarray
        Input dataset to analyze.
    name : str, optional
        Name identifier for the dataset (default is 'dataset').

    Attributes
    ----------
    data : np.ndarray
        The stored dataset.
    name : str
        The dataset name.
    n_samples : int
        Number of samples in the dataset.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    >>> analyzer = DataAnalyzer(data, name='test_data')
    >>> summary = analyzer.get_summary()
    >>> print(summary['mean'])
    5.5

    Notes
    -----
    This class is designed for 1D numerical datasets. For multidimensional
    data, consider reshaping or using specialized tools.
    """

    def __init__(self, data: np.ndarray, name: str = 'dataset'):
        """Initialize the DataAnalyzer."""
        self.data = data
        self.name = name
        self.n_samples = len(data)

    def get_summary(self) -> dict:
        """
        Get statistical summary of the dataset.

        Returns
        -------
        dict
            Dictionary containing statistical measures:

            * 'mean': arithmetic mean
            * 'median': median value
            * 'std': standard deviation (population, N divisor)
            * 'min': minimum value
            * 'max': maximum value
            * 'q25': 25th percentile
            * 'q75': 75th percentile

        Examples
        --------
        >>> import numpy as np
        >>> analyzer = DataAnalyzer(np.array([1, 2, 3, 4, 5]))
        >>> summary = analyzer.get_summary()
        >>> print(summary['median'])
        3.0
        """
        return {
            'mean': np.mean(self.data),
            'median': np.median(self.data),
            'std': np.std(self.data),
            'min': np.min(self.data),
            'max': np.max(self.data),
            'q25': np.percentile(self.data, 25),
            'q75': np.percentile(self.data, 75),
        }

    def detect_outliers(self, method: str = 'iqr', threshold: float = 1.5) -> np.ndarray:
        """
        Detect outliers in the dataset.

        Parameters
        ----------
        method : {'iqr', 'zscore'}, optional
            Method for outlier detection (default is 'iqr').

            * 'iqr': Interquartile range method
            * 'zscore': Z-score method
        threshold : float, optional
            Threshold for outlier detection. The default of 1.5 applies to
            both methods; for z-score a value of 3.0 is typical and should be
            passed explicitly.

        Returns
        -------
        np.ndarray
            Boolean array indicating outliers (True) and inliers (False).

        Raises
        ------
        ValueError
            If method is not 'iqr' or 'zscore'.

        Examples
        --------
        >>> import numpy as np
        >>> data = np.array([1, 2, 3, 4, 5, 100])  # 100 is an outlier
        >>> analyzer = DataAnalyzer(data)
        >>> outliers = analyzer.detect_outliers(method='iqr')
        >>> print(data[outliers])
        [100]

        Notes
        -----
        The IQR method considers values outside of
        :math:`[Q1 - threshold \\times IQR, Q3 + threshold \\times IQR]`
        as outliers, where :math:`IQR = Q3 - Q1`.

        The z-score method flags values whose absolute z-score (using the
        population standard deviation) exceeds ``threshold``. If the data has
        zero standard deviation, no value is flagged.
        """
        if method == 'iqr':
            q1 = np.percentile(self.data, 25)
            q3 = np.percentile(self.data, 75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            return (self.data < lower) | (self.data > upper)
        elif method == 'zscore':
            spread = np.std(self.data)
            if spread == 0:
                # Constant data: every z-score would be 0/0. Nothing deviates
                # from the mean, so report no outliers instead of dividing.
                return np.zeros(np.shape(self.data), dtype=bool)
            z_scores = np.abs((self.data - np.mean(self.data)) / spread)
            return z_scores > threshold
        else:
            raise ValueError(f"Unknown method '{method}'. Use 'iqr' or 'zscore'.")