Source code for sensortoolkit.calculate._regression_stats

# -*- coding: utf-8 -*-
"""
This module computes statistical metrics for the Ordinary Least Squares linear
regression between sensor and FRM/FEM datasets (FRM/FEM as the dependent
variable along the x-axis and sensor data as the independent variable along
the y-axis).

Bias and linearity
------------------

U.S. EPA's Performance Targets Reports recommend using a linear regression
model, relating sensor and FRM/FEM measurements, to determine the magnitude
of bias and linearity. The regression model takes the form

.. math::

    y = mx + b

where

    :math:`y` = sensor measurements

    :math:`x` = FRM/FEM measurements

    :math:`m` = regression slope

    :math:`b` = regression intercept


The slope :math:`m` and intercept :math:`b` indicate the degree of systematic
bias between sensor and reference measurements.

Linearity is measured via the coefficient of determination (:math:`R^2`).

===============================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Tue Mar 3 13:47:32 2020
Last Updated:
  Tue Jul 13 11:09:13 2021
"""
import os
import sys
import numpy as np
import pandas as pd
from sensortoolkit.datetime_utils import get_todays_date
from sensortoolkit.lib_utils import check_type


def regression_stats(sensor_df_obj, ref_df_obj, deploy_dict, param, serials,
                     verbose=True):
    """Compute OLS regression statistics.

    Module is used to compute the following regressions:

    - Sensor vs. FRM/FEM
    - Sensor vs. Inter-sensor average

    For each instance, the dependent and independent variables are assigned
    as **hourly/daily sensor data vs. hourly/daily reference data**; please
    note the ``ref_df_obj`` object can be either a DataFrame containing
    FRM/FEM concentrations, or a DataFrame containing intersensor averages
    depending on the use case. The 'ref' label refers more so to the fact
    that the dataset is used as the independent variable for regressions.

    .. note::

        The DataFrames within the ``sensor_df_obj`` and ``ref_df_obj``
        arguments should contain data reported at the same sampling
        frequency (e.g., if a sensor DataFrame containing data at 1-hour
        averaged intervals is passed to the ``sensor_df_obj``, the reference
        DataFrame passed to ``ref_df_obj`` must also contain data at 1-hour
        averaged intervals).

    Args:
        sensor_df_obj (pandas DataFrame or list of pandas DataFrames):
            Either a DataFrame or list of DataFrames containing sensor
            parameter measurements. Data corresponding to passed parameter
            name are used as the dependent variable.
        ref_df_obj (pandas DataFrame):
            Reference DataFrame (either FRM/FEM OR Inter-sensor averages).
            Data corresponding to passed parameter name are used as the
            independent variable.
        deploy_dict (dict):
            A dictionary containing descriptive statistics and textual
            information about the deployment (testing agency, site, time
            period, etc.), sensors tested, and site conditions during the
            evaluation.
        param (str):
            Parameter name for which to compute regression statistics.
        serials (dict):
            A dictionary of sensor serial identifiers for each unit in a
            testing group.
        verbose (bool):
            If true, print statements will be displayed in the console
            output. Defaults to True.

    Raises:
        ValueError: If the sampling interval of a sensor dataset is neither
            1-hour nor 24-hour averaged (previously this case left the
            averaging-interval label unassigned and raised a NameError
            further down).

    Returns:
        stats_df (pandas DataFrame):
            Statistics DataFrame for either sensor vs. FRM/FEM or sensor
            vs. intersensor mean OLS regression.
    """
    stats_cols = ['Sensor Name', 'Sensor_Number', 'Sensor_Serial',
                  'Averaging Interval', 'Param', 'Reference', 'R$^2$',
                  'Slope', 'Intercept', 'Sensor RMSE', 'N', 'Sensor_Min',
                  'Sensor_Max', 'Sensor_Mean']
    stats_df = pd.DataFrame(columns=stats_cols)

    sensor_name = deploy_dict['Sensor Name']
    sensor_nums = list(serials.keys())

    # Validate argument types; check_type returns the matched type
    sensor_dtype = check_type(sensor_df_obj,
                              accept_types=[pd.core.frame.DataFrame, list])
    check_type(ref_df_obj, accept_types=[pd.core.frame.DataFrame])

    # Normalize to a list of sensor DataFrames
    if sensor_dtype is pd.core.frame.DataFrame:
        df_list = [sensor_df_obj]
    if sensor_dtype is list:
        df_list = sensor_df_obj

    # Loop over dataframes for sensors, compute regression stats
    for i, (df, sensor_num) in enumerate(zip(df_list, sensor_nums)):
        ydata = df[param + '_Value']

        # Reference data are either intersensor averages or FRM/FEM
        # concentrations; the reference label differs accordingly.
        if 'mean_' + param + '_Value' in ref_df_obj:
            xdata = ref_df_obj['mean_' + param + '_Value']
            ref_name = 'Intersensor_mean_' + param
        else:
            xdata = ref_df_obj[param + '_Value']
            # Use the (assumed unique) reference method name as the label
            ref_name = ''.join(
                        list(ref_df_obj[param + '_Method'].dropna().apply(
                                            lambda x: str(x)).unique()[0]))

        # Locate the union of non-nan sensor (y) and reference (x) data:
        # rows where either value is null are dropped before regression.
        combine_df = pd.DataFrame()
        combine_df['ref_data'] = xdata
        combine_df['sensor_data'] = ydata
        combine_df = combine_df.dropna()

        # Determine the averaging interval for the supplied dataset.
        # BUGFIX: previously an unsupported interval left ``avg_int``
        # unassigned and caused a NameError when referenced below.
        interval = df.index[1] - df.index[0]
        if interval == pd.Timedelta('1 days'):
            avg_int = '24-hour'
        elif interval == pd.Timedelta('1 hour'):
            avg_int = '1-hour'
        else:
            raise ValueError('Unsupported sampling interval ({0}): datasets '
                             'must be averaged to 1-hour or 24-hour '
                             'intervals'.format(interval))

        if i == 0 and verbose:
            print('Computing {0} regression statistics for {1} vs.'
                  ' {2}'.format(avg_int, sensor_name, ref_name))

        if verbose:
            print('..{0}'.format(list(serials.values())[i]))

        if combine_df.empty:
            print('..Warning: Linear regression not possible for sensor '
                  + str(sensor_num) + '. Sensor or reference data are '
                  'null.')
            r_square = slope = intercept = rmse = N = np.nan
            param_min = param_max = param_mean = np.nan
        else:
            N = int(len(combine_df))

            if N <= 2:
                # Too few paired points for a meaningful fit
                print('..Warning: Linear regression not possible for'
                      ' sensor ' + str(sensor_num) +
                      '. Insufficient number of data points')
                r_square = slope = intercept = rmse = N = np.nan
                param_min = param_max = param_mean = np.nan
            else:
                # Compute linear regress. for the union of finite x and y
                fit = np.polyfit(combine_df.ref_data,
                                 combine_df.sensor_data, 1)
                slope = fit[0]
                intercept = fit[1]
                pearson_coeff = combine_df.ref_data.corr(
                                                combine_df.sensor_data,
                                                method='pearson')
                # R^2 is the square of the Pearson correlation coefficient
                r_square = pearson_coeff*pearson_coeff

                sensor_val = combine_df.sensor_data
                ref_val = combine_df.ref_data
                MSE = (1/N)*sum((sensor_val-ref_val)**2)
                rmse = np.sqrt(MSE)

                param_min = combine_df.sensor_data.min()
                param_max = combine_df.sensor_data.max()
                param_mean = combine_df.sensor_data.mean()

        serial_id = serials[str(sensor_num)]

        # Assemble a single-row DataFrame aligned with stats_cols
        row_df = pd.DataFrame([sensor_name, sensor_num, serial_id,
                               avg_int, param, ref_name, r_square,
                               slope, intercept, rmse, N, param_min,
                               param_max, param_mean])
        row_df.index = stats_df.columns
        row_df = row_df.T
        stats_df = pd.concat([stats_df, row_df]).reset_index(drop=True)

    # Add metric averages if multiple sensor datasets passed to function
    if sensor_dtype is list:
        metric_list = ['R$^2$', 'Slope', 'Intercept', 'Sensor RMSE', 'N',
                       'Sensor_Min', 'Sensor_Max', 'Sensor_Mean']
        # avg_int holds the interval label from the last processed dataset;
        # all datasets are assumed to share the same interval (see note).
        avg_data = stats_df.where(stats_df['Averaging Interval'] == avg_int
                                  ).dropna(how='all', axis=0)
        avg_df = pd.DataFrame(np.nan, columns=stats_cols, index=[0])
        avg_df['Reference'] = 'Metric Average:'

        # Metric columns begin at position 6 ('R$^2$') in stats_cols
        for col_i, metric in enumerate(metric_list, 6):
            metric_data = avg_data.iloc[:, col_i]
            metric_mean = metric_data.mean()
            avg_df.iloc[0, col_i] = metric_mean

        stats_df = pd.concat([stats_df, avg_df]).reset_index(drop=True)

    stats_df[stats_df.columns[6:]] = stats_df[stats_df.columns[6:]
                                              ].astype('float64')

    return stats_df
def join_stats(hourly_stats, daily_stats, write_to_file=False,
               stats_path=None, stats_type='individual'):
    """Combine 1-hour and 24-hour regression statistics DataFrames.

    Args:
        hourly_stats (pandas DataFrame):
            DataFrame containing 1-hour averaged sensor vs. reference
            regression statistics, returned by call to regression_stats().
        daily_stats (pandas DataFrame):
            DataFrame containing 24-hour averaged sensor vs. reference
            regression statistics, returned by call to regression_stats().
        write_to_file (bool, optional):
            If True, the merged statistics dataframe will be written to a
            csv file. Defaults to False.
        stats_path (str, optional):
            The full path to the statistics directory where the dataset
            will be saved if ``write_to_file`` is True. Defaults to None.
        stats_type (str, optional):
            The type of regression statistics dataframes that are being
            joined ('individual' indicates individual sensor vs. reference
            regression, 'average' indicates sensor vs. intersensor average
            regression statistics)

    Raises:
        ValueError: If ``stats_type`` is neither 'individual' nor 'average'.

    Returns:
        stats_df (pandas DataFrame):
            DataFrame containing both 1-hour and 24-hour averaged
            statistics.
    """
    if stats_type not in ['average', 'individual']:
        raise ValueError(f'Invalid value {stats_type} passed to stats_type. '
                         'Choose from either "individual" or "average"')

    # Map the stats type to the label used in the saved filename
    if stats_type == 'individual':
        stats_type = 'stats_df'
    if stats_type == 'average':
        stats_type = 'avg_stats_df'

    # BUGFIX: DataFrame.append() was removed in pandas 2.0; pd.concat is
    # the supported equivalent and produces the same stacked result.
    stats_df = pd.concat([hourly_stats, daily_stats])

    # Most frequent non-average reference/sensor/param labels describe the
    # merged dataset for the output filename
    ref_name = stats_df.Reference[stats_df.Reference != 'Metric Average:'
                                  ].value_counts().keys()[0]
    sensor_name = stats_df['Sensor Name'].value_counts().keys()[0]
    param = stats_df['Param'].value_counts().keys()[0]

    # Save the statistics DataFrame
    if write_to_file is True:
        if stats_path is None:
            sys.exit('No directory path specified for saving DataFrame')
        today = get_todays_date()
        ref_name = ref_name.replace(' ', '_')
        filename = '_'.join([sensor_name, param, 'vs', ref_name,
                             stats_type, today])
        filename += '.csv'
        dataframe_to_csv(stats_df, parent_path=stats_path,
                         filename=filename)

    return stats_df
def dataframe_to_csv(obj, parent_path, filename, **kwargs):
    """Save a pandas DataFrame to a comma-separated value file.

    Args:
        obj (pandas DataFrame):
            The DataFrame to write to a .csv file.
        parent_path (str):
            The path to the folder where the file will be saved.
        filename (str):
            The name of the resulting .csv file.
        **kwargs (dict):
            Keyword arguments passed to the pandas ``to_csv()`` method.
            ``index`` defaults to False if not supplied.

    Returns:
        None.
    """
    check_type(obj, accept_types=[pd.core.frame.DataFrame])
    filepath = os.path.join(parent_path, filename)
    print('..Saving dataset to the following path: {0}'.format(filepath))
    # BUGFIX: previously only the 'index' kwarg was honored and every other
    # keyword argument was silently discarded, contradicting the docstring.
    # Forward all kwargs while preserving the index=False default.
    kwargs.setdefault('index', False)
    obj.to_csv(filepath, **kwargs)