Source code for hplc_analysis

"""
HPLC Chromatogram Analysis Module

This module provides functions for analyzing HPLC chromatogram data, including
peak detection and basic chromatographic parameter calculations.

eLabFTW Integration
-------------------
This analysis script is designed to work with chromatogram data exported from
laboratory instruments. Link your analysis to experimental records in eLabFTW:

* **Equipment Records**: Document instrument specifications and maintenance
  (e.g., https://your-elabftw-instance.org/database.php?mode=view&id=EQUIP-12345)
* **Experiment Records**: Reference the specific experimental run
  (e.g., https://your-elabftw-instance.org/experiments.php?mode=view&id=67890)

Example eLabFTW Reference in Code
----------------------------------
Always include eLabFTW links in your data files and analysis scripts:

.. code-block:: python

    # Link to eLabFTW experiment in file header or docstring
    # Experiment: https://your-elabftw-instance.org/experiments.php?mode=view&id=67890
    # Equipment: https://your-elabftw-instance.org/database.php?mode=view&id=EQUIP-12345

For more information on eLabFTW integration, see the documentation.
"""

import numpy as np
from typing import Tuple, List, Dict
from pathlib import Path



[docs]
def load_chromatogram(filepath: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Read a two-column HPLC chromatogram (time, absorbance) from a text file.

    The file is scanned line by line. Blank lines and lines whose first
    non-whitespace character is '#' are ignored. Every remaining line is
    split on whitespace; the first two fields are converted to floats and
    any further fields are ignored.

    Parameters
    ----------
    filepath : str
        Location of the chromatogram file. Expected layout is two
        whitespace-separated columns: time (minutes) and absorbance (mAU).

    Returns
    -------
    time : np.ndarray
        1D array holding the first column (time, minutes).
    absorbance : np.ndarray
        1D array holding the second column (absorbance, mAU).

    Raises
    ------
    FileNotFoundError
        If ``filepath`` does not exist.
    ValueError
        If the file contains no line that yields two numeric values.

    Examples
    --------
    >>> time, absorbance = load_chromatogram('data/sample_hplc_chromatogram.txt')
    >>> print(f"Data points: {len(time)}")
    Data points: 101
    >>> print(f"Time range: {time[0]:.2f} - {time[-1]:.2f} min")
    Time range: 0.00 - 10.00 min

    Notes
    -----
    Parsing is best-effort: a line with fewer than two fields, or whose
    first two fields are not numeric, is skipped silently rather than
    reported. Only a file with no usable rows at all raises ``ValueError``.

    **eLabFTW Best Practice**: Put the eLabFTW experiment ID in the file
    header as a comment so the raw data stays linked to its experimental
    record.

    Example file header::

        # Experiment: eLabFTW #67890
        # Equipment: HPLC-UV (eLabFTW ID: EQUIP-12345)
        # Time(min)    Absorbance(mAU)
        0.00    2.1
        ...

    See Also
    --------
    find_peaks : Detect peaks in the loaded chromatogram
    """
    if not Path(filepath).exists():
        raise FileNotFoundError(f"Chromatogram file not found: {filepath}")

    rows = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            # Nothing to parse on blank lines and '#' comment lines.
            if not text or text.startswith('#'):
                continue

            fields = text.split()
            if len(fields) < 2:
                continue

            try:
                row = [float(fields[0]), float(fields[1])]
            except ValueError:
                # Non-numeric row (e.g. an uncommented column header).
                continue
            rows.append(row)

    if not rows:
        raise ValueError(f"No valid data found in {filepath}")

    table = np.array(rows)
    return table[:, 0], table[:, 1]




[docs]
def find_peaks(time: np.ndarray, absorbance: np.ndarray, 
               threshold: float = 10.0, min_distance: int = 5) -> List[Dict[str, float]]:
    """
    Detect peaks in HPLC chromatogram data.

    A sample is a peak apex when it is strictly greater than both of its
    neighbours and strictly greater than ``threshold``. Apexes closer than
    ``min_distance`` samples to the previously accepted peak are merged
    with it: the higher of the two apexes is kept.

    Parameters
    ----------
    time : np.ndarray
        Time values in minutes (1D array).
    absorbance : np.ndarray
        Absorbance values in mAU (1D array), same length as ``time``.
    threshold : float, optional
        Minimum peak height in mAU to be considered a peak (default: 10.0).
    min_distance : int, optional
        Minimum number of data points between peaks (default: 5).

    Returns
    -------
    peaks : list of dict
        Detected peaks in order of increasing index. Every dictionary
        contains all four keys:

        * 'retention_time': float - Peak retention time in minutes
        * 'height': float - Peak height in mAU
        * 'area': float - Approximate peak area (trapezoidal integration)
        * 'index': int - Index of peak maximum in the data array

    Raises
    ------
    ValueError
        If ``time`` and ``absorbance`` differ in length.

    Examples
    --------
    >>> time, absorbance = load_chromatogram('data/sample_hplc_chromatogram.txt')
    >>> peaks = find_peaks(time, absorbance, threshold=50.0)
    >>> for i, peak in enumerate(peaks, 1):
    ...     print(f"Peak {i}: RT={peak['retention_time']:.2f} min, "
    ...           f"Height={peak['height']:.1f} mAU")
    Peak 1: RT=2.10 min, Height=98.7 mAU
    Peak 2: RT=5.70 min, Height=122.3 mAU

    Notes
    -----
    This is a simple peak detection algorithm suitable for well-resolved peaks.
    For complex chromatograms with overlapping peaks, consider using more
    sophisticated peak deconvolution methods.

    The peak area is a trapezoidal integral of the raw signal (no baseline
    subtraction). The integration window extends outwards from the apex on
    each side until the absorbance is no longer above 10% of ``threshold``
    or the edge of the data is reached. Peaks that are not separated by
    such a low point therefore share the same integration window.

    Flat-topped (plateau) maxima are not detected because the apex test
    uses strict comparisons against both neighbours.

    **Documentation in eLabFTW**: When documenting peak identification results,
    include them in your eLabFTW experiment record along with:

    * Integration parameters (threshold, baseline correction)
    * Peak assignments (compound identities)
    * Calibration curve information for quantification

    This ensures all analysis parameters are tracked with your experimental data.

    See Also
    --------
    load_chromatogram : Load chromatogram data from file
    calculate_resolution : Calculate peak resolution
    """
    n = len(absorbance)
    if len(time) != n:
        raise ValueError(
            f"time and absorbance must have the same length "
            f"(got {len(time)} and {n})"
        )

    # np.trapezoid was introduced in NumPy 2.0; older releases only provide
    # np.trapz. Look the new name up first so np.trapz is never touched (and
    # never emits a DeprecationWarning) when the new function is available.
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz

    # Signal level that bounds the integration window around an apex.
    cutoff = threshold * 0.1

    def describe_peak(apex: int) -> Dict[str, float]:
        """Build the full peak record (including area) for the apex index."""
        start_idx = apex
        while start_idx > 0 and absorbance[start_idx] > cutoff:
            start_idx -= 1

        end_idx = apex
        while end_idx < n - 1 and absorbance[end_idx] > cutoff:
            end_idx += 1

        peak_area = integrate(absorbance[start_idx:end_idx + 1],
                              time[start_idx:end_idx + 1])
        return {
            'retention_time': time[apex],
            'height': absorbance[apex],
            'area': peak_area,
            'index': apex
        }

    peaks = []
    for i in range(1, n - 1):
        is_apex = (absorbance[i] > absorbance[i - 1] and
                   absorbance[i] > absorbance[i + 1] and
                   absorbance[i] > threshold)
        if not is_apex:
            continue

        # Too close to the previously accepted peak: keep only the higher
        # apex. The replacement is a complete record, so 'area' is always
        # present in every returned peak.
        if peaks and (i - peaks[-1]['index']) < min_distance:
            if absorbance[i] > peaks[-1]['height']:
                peaks[-1] = describe_peak(i)
            continue

        peaks.append(describe_peak(i))

    return peaks




[docs]
def calculate_resolution(peak1: Dict[str, float], peak2: Dict[str, float], 
                        time: np.ndarray, absorbance: np.ndarray) -> float:
    """
    Estimate the chromatographic resolution (Rs) of two peaks.

    Resolution quantifies how well two peaks are separated:

    .. math::
        R_s = \\frac{2(t_{R2} - t_{R1})}{w_1 + w_2}

    with :math:`t_{R1}`, :math:`t_{R2}` the retention times and
    :math:`w_1`, :math:`w_2` the peak widths at baseline.

    Parameters
    ----------
    peak1 : dict
        First peak, as returned by find_peaks(). The keys
        'retention_time', 'height' and 'index' are read.
    peak2 : dict
        Second peak, same structure as ``peak1``.
    time : np.ndarray
        Time values in minutes.
    absorbance : np.ndarray
        Absorbance values in mAU.

    Returns
    -------
    resolution : float
        Resolution value. Rs > 1.5 indicates baseline separation. The
        absolute retention-time difference is used, so the order of the
        two peaks does not matter.

    Examples
    --------
    >>> time, absorbance = load_chromatogram('data/sample_hplc_chromatogram.txt')
    >>> peaks = find_peaks(time, absorbance, threshold=50.0)
    >>> if len(peaks) >= 2:
    ...     rs = calculate_resolution(peaks[0], peaks[1], time, absorbance)
    ...     print(f"Resolution: {rs:.2f}")
    Resolution: 4.23

    Notes
    -----
    Baseline widths are not measured directly. For each peak the width at
    half height (FWHH) is taken between the first samples on either side
    of the apex whose absorbance is no longer above half the peak height
    (no interpolation between samples; the search stops at the ends of the
    data). The baseline width is then approximated as
    baseline_width ≈ 2 * FWHH.

    **Quality Control**: Document resolution values in eLabFTW for method
    validation and quality control. Include acceptance criteria (typically
    Rs > 1.5 for baseline separation).

    References
    ----------
    .. [1] Snyder, L. R., Kirkland, J. J., & Dolan, J. W. (2010).
           Introduction to Modern Liquid Chromatography (3rd ed.). Wiley.
    """
    separation = abs(peak2['retention_time'] - peak1['retention_time'])

    def baseline_width(peak):
        """Return the approximate baseline width (2 x FWHH) of one peak."""
        apex = peak['index']
        half_level = peak['height'] / 2.0
        last = len(absorbance) - 1

        # Walk outwards from the apex until the signal is at or below
        # half height, or the edge of the data is reached.
        lo = apex
        while lo > 0 and absorbance[lo] > half_level:
            lo -= 1

        hi = apex
        while hi < last and absorbance[hi] > half_level:
            hi += 1

        return 2.0 * (time[hi] - time[lo])

    width_sum = baseline_width(peak1) + baseline_width(peak2)
    return 2.0 * separation / width_sum




[docs]
def analyze_chromatogram(filepath: str, threshold: float = 10.0) -> Dict:
    """
    Run the full load -> peak detection -> summary workflow on one file.

    The chromatogram is read with load_chromatogram(), peaks are located
    with find_peaks() using the given threshold, and a baseline estimate
    is taken from the start of the trace. Everything is returned in a
    single dictionary suitable for reporting.

    Parameters
    ----------
    filepath : str
        Path to chromatogram data file.
    threshold : float, optional
        Peak detection threshold in mAU (default: 10.0).

    Returns
    -------
    results : dict
        Analysis results containing:

        * 'filepath': str - Input file path, as passed in
        * 'n_points': int - Number of data points
        * 'time_range': tuple - (first_time, last_time) in minutes
        * 'peaks': list - Detected peaks with properties
        * 'n_peaks': int - Number of detected peaks
        * 'baseline': float - Estimated baseline absorbance

    Examples
    --------
    >>> results = analyze_chromatogram('data/sample_hplc_chromatogram.txt', 
    ...                                threshold=50.0)
    >>> print(f"Detected {results['n_peaks']} peaks")
    Detected 2 peaks
    >>> for i, peak in enumerate(results['peaks'], 1):
    ...     print(f"Peak {i}: RT={peak['retention_time']:.2f} min")
    Peak 1: RT=2.10 min
    Peak 2: RT=5.70 min

    Notes
    -----
    The baseline is the median absorbance of the first 10% of the samples
    (at least one sample). It is reported only; it is not subtracted from
    the signal before peak detection.

    **Complete Workflow with eLabFTW**:

    1. **Before Analysis**:

       * Create experiment record in eLabFTW with method details
       * Record instrument ID and calibration information
       * Upload raw data file to eLabFTW

    2. **During Analysis**:

       * Run this analysis function
       * Reference eLabFTW experiment ID in analysis script header

    3. **After Analysis**:

       * Export results to CSV or JSON
       * Upload results to eLabFTW experiment record
       * Link GitHub repository/commit in eLabFTW for code traceability
       * Document any manual peak assignments or corrections

    This workflow ensures complete traceability from raw data to final results.

    See Also
    --------
    load_chromatogram : Load data file
    find_peaks : Peak detection algorithm
    """
    time, absorbance = load_chromatogram(filepath)
    detected = find_peaks(time, absorbance, threshold=threshold)

    # Leading tenth of the trace, but never fewer than one sample.
    lead_count = len(absorbance) // 10
    if lead_count < 1:
        lead_count = 1
    baseline_level = np.median(absorbance[:lead_count])

    first_time, last_time = float(time[0]), float(time[-1])

    return {
        'filepath': filepath,
        'n_points': len(time),
        'time_range': (first_time, last_time),
        'peaks': detected,
        'n_peaks': len(detected),
        'baseline': float(baseline_level),
    }



if __name__ == '__main__':
    # Demonstration run: analyse the bundled sample chromatogram and print
    # a plain-text report (summary, per-peak table, pairwise resolution).
    heavy_rule = "=" * 50
    light_rule = "-" * 50

    # eLabFTW references for this run (replace with your instance URLs)
    print("HPLC Chromatogram Analysis")
    print(heavy_rule)
    print("eLabFTW Experiment: https://your-elabftw-instance.org/experiments.php?mode=view&id=67890")
    print("eLabFTW Equipment: https://your-elabftw-instance.org/database.php?mode=view&id=EQUIP-12345")
    print()

    data_file = 'data/sample_hplc_chromatogram.txt'
    results = analyze_chromatogram(data_file, threshold=50.0)
    found_peaks = results['peaks']
    t_first, t_last = results['time_range']

    # Summary section
    print(f"File: {results['filepath']}")
    print(f"Data points: {results['n_points']}")
    print(f"Time range: {t_first:.2f} - {t_last:.2f} min")
    print(f"Baseline: {results['baseline']:.2f} mAU")
    print(f"\nDetected {results['n_peaks']} peaks:")
    print(light_rule)

    # One entry per detected peak
    for number, peak in enumerate(found_peaks, 1):
        print(f"\nPeak {number}:")
        print(f"  Retention Time: {peak['retention_time']:.2f} min")
        print(f"  Height: {peak['height']:.1f} mAU")
        print(f"  Area: {peak['area']:.1f} mAU·min")

    # Resolution between each pair of neighbouring peaks. The raw trace is
    # loaded again because the results dictionary does not carry the arrays.
    if results['n_peaks'] >= 2:
        print("\nPeak Resolution:")
        print(light_rule)
        time, absorbance = load_chromatogram(data_file)
        neighbours = zip(found_peaks, found_peaks[1:])
        for number, (earlier, later) in enumerate(neighbours, 1):
            rs = calculate_resolution(earlier, later, time, absorbance)
            print(f"Peak {number} - Peak {number + 1}: Rs = {rs:.2f}")

    print("\n" + heavy_rule)
    print("Analysis complete. Document results in eLabFTW experiment record.")