# -*- coding: utf-8 -*-
"""
Top-level analysis module for the ``sensortoolkit`` library.
Contains the front-facing ``SensorEvaluation`` class for conducting analysis
of sensor data.
===============================================================================
@Author:
| Samuel Frederick, NSSC Contractor (ORAU)
| U.S. EPA / ORD / CEMM / AMCD / SFSB
Created:
Fri Jul 31 08:39:37 2020
Last Updated:
Wed Jul 7 15:01:00 2021
"""
import math
import json
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sensortoolkit.calculate
import sensortoolkit.datetime_utils
import sensortoolkit.deploy
import sensortoolkit.lib_utils
import sensortoolkit.model
import sensortoolkit.param
import sensortoolkit.plotting
import sensortoolkit.qc
import sensortoolkit.reference
import sensortoolkit.ingest
from sensortoolkit import presets as _presets
[docs]class SensorEvaluation:
"""Evaluate air sensor performance for use in NSIM applications.
A class for conducting analysis for air sensors deployed at ambient,
outdoor, fixed monitoring sites using U.S. EPA's performance metrics and
targets for sensors measuring PM2.5 or O3. U.S. EPA's testing protocols and
performance metrics are intended for use with devices deployed for
non-regulatory supplemental and informational monitoring (NSIM)
applications.
Args:
sensor (sensortoolkit.AirSensor object):
The air sensor object containing datasets with parameter
measurements that will be evaluated.
param (sensortoolkit.Parameter object):
The parameter (measured environmental quantity) object containing
parameter-specific attributes as well as metrics and targets for
evaluating sensor performance.
reference (sensortoolkit.ReferenceMethod object):
The FRM/FEM reference instrument object containing datasets with
parameter measurements against which air sensor data will be
evaluated.
write_to_file (bool):
If true, evaluation statistics will be written to the
``/data/eval_stats`` sensor subdirectory. Figures will also be
written to the appropriate figures subdirectory.
**kwargs:
Keyword arguments (currently unused).
Attributes:
path (str): The project path in which data, figures, and reports
relevant to the sensor evaluation are stored.
serials (dict): A dictionary of sensor serial identifiers for each
unit in the base testing deployment.
figure_path (str): The full directory path to figures for a given
sensor make and model.
stats_path: The full directory path to evaluation statistics for a
given sensor make and model.
full_df_list (list of pandas DataFrames): List of sensor data frames
of length N (where N is the number of sensor units in a testing
group). DataFrames indexed by ``DateTime`` at recorded sampling
frequency.
hourly_df_list (list of pandas DataFrames): List of sensor data frames
of length N (where N is the number of sensor units in a testing
group). DataFrames indexed by ``DateTime`` at 1-hour averaged
sampling frequency.
daily_df_list (list of pandas DataFrames): List of sensor data frames
of length N (where N is the number of sensor units in a testing
group). DataFrames indexed by ``DateTime`` at 24-hour averaged
sampling frequency.
deploy_period_df (pandas DataFrame): A data frame containing the start
time (‘Begin’), end time (‘End’), and total duration of evaluation
period for each sensor in a deployment group.
deploy_dict (dict): A dictionary containing descriptive statistics and
textual information about the deployment (testing agency, site,
time period, etc.), sensors tested, and site conditions during the
evaluation.
deploy_bdate (pandas timestamp object): Overall start date of
deployment. Determined by selecting the earliest recorded timestamp
in sensor data frames.
deploy_edate (pandas timestamp object): Overall end date of deployment.
Determined by selecting the latest recorded timestamp in sensor
data frames.
ref_dict (dict):
A dictionary container for reference data objects at varying
averaging intervals and parameter classifications.
hourly_ref_df (pandas DataFrame):
Dataset containing reference data at 1-hour averaging intervals
for methods measuring parameters matching the parameter
classification of the parameter object passed to the
``SensorEvaluation`` class during instantiation.
daily_ref_df (pandas DataFrame):
Dataset containing reference data at 24-hour averaging intervals
for methods measuring parameters matching the parameter
classification of the parameter object passed to the
``SensorEvaluation`` class during instantiation.
pm_hourly_ref_df (pandas DataFrame):
Dataset containing reference data at 1-hour averaging intervals
for methods measuring particulate matter parameters.
pm_daily_ref_df (pandas DataFrame):
Dataset containing reference data at 24-hour averaging intervals
for methods measuring particulate matter parameters.
gas_hourly_ref_df (pandas DataFrame):
Dataset containing reference data at 1-hour averaging intervals
for methods measuring gaseous parameters.
gas_daily_ref_df (pandas DataFrame):
Dataset containing reference data at 24-hour averaging intervals
for methods measuring gaseous parameters.
met_hourly_ref_df (pandas DataFrame):
Dataset containing reference data at 1-hour averaging intervals
for methods measuring meteorological parameters.
met_daily_ref_df (pandas DataFrame):
Dataset containing reference data at 24-hour averaging intervals
for methods measuring meteorological parameters.
ref_name (str): The make and model of the FRM/FEM instrument used as
reference for the selected evaluation parameter. Both AirNowTech
and AQS return the AQS method code, and the AQS Sampling Methods
Reference table is used to determine the instrument name associated
with this code. AirNow does not return method codes or instrument
names. When the name and type of the FRM/FEM instrument are
unknown, ref_name takes the value ‘unknown_reference’.
avg_hrly_df (pandas DataFrame): Data frame containing the inter-sensor
average for concurrent sensor measurements at 1-hour averaging
intervals.
avg_daily_df (pandas DataFrame): Data frame containing the inter-sensor
average for concurrent sensor measurements at 24-hour averaging
intervals.
stats_df (pandas DataFrame): Data frame with OLS regression (sensor vs
FRM/FEM) statistics, including R2, slope, intercept, RMSE, N
(Number of sensor-FRM/FEM data point pairs), as well as the
minimum, maximum, and the mean sensor concentration.
avg_stats_df (pandas DataFrame): Data frame with OLS regression (sensor
vs intersensor average) statistics, including R2, slope,
intercept, RMSE, N (Number of concurrent sensor measurements during
which all sensors in the testing group reported values), as well as
the minimum, maximum, and the mean sensor concentration.
"""
    def __init__(self, sensor, param, reference, write_to_file=False,
                 **kwargs):
        # Objects describing the sensor under test, the evaluation
        # parameter, and the collocated FRM/FEM reference monitor.
        self.sensor = sensor
        self.name = sensor.name
        self.reference = reference
        # Sensor datasets must be loaded before an evaluation can proceed;
        # exit with a helpful message if AirSensor.load_data() was not run.
        try:
            self.sensor.data
        except AttributeError as error:
            sys.exit(f'{error}, use the AirSensor.load_data() method to import'
                     f' data')
        self.path = sensor.project_path
        self.serials = sensor.serials
        # Private to avoid confusion between SensorEvaluation attribute and
        # parameter attribute
        self.param = param
        self._param_name = param.name
        if self._param_name not in self.sensor.param_headers:
            raise AttributeError(f'{self._param_name} is not in the list of '
                                 f'parameters measured by {self.name}')
        self.write_to_file = write_to_file
        # Testing location/organization metadata configured via presets.
        self.testing_loc = _presets.test_loc
        self.testing_org = _presets.test_org
        # Add keyword arguments as instance attributes (also retained in
        # self.kwargs for pass-through to deployment routines below).
        self.__dict__.update(**kwargs)
        self.kwargs = kwargs
        # path to sensor figures
        self.figure_path = os.path.join(self.path, 'figures', self.name, '')
        # path to evaluation statistics
        self.stats_path = os.path.join(self.path, 'data',
                                       'eval_stats', self.name, '')
        # Sensor datasets at recorded, 1-hour, and 24-hour intervals (one
        # DataFrame per sensor unit).
        rec_int = self.sensor.recording_interval
        self.full_df_list = list(self.sensor.data[rec_int].values())
        self.hourly_df_list = list(self.sensor.data['1-hour'].values())
        self.daily_df_list = list(self.sensor.data['24-hour'].values())
        # Compute sensor deployment period and concurrent deployment groups
        self.deploy_period_df = sensortoolkit.deploy.deployment_period(
            self.full_df_list,
            self.name,
            self.serials)
        self.deploy_dict = sensortoolkit.deploy.construct_deploy_dict(
            self.deploy_period_df,
            self.full_df_list,
            self.hourly_df_list,
            self.daily_df_list,
            self.name,
            **self.kwargs)
        # Overall deployment begin/end: earliest start and latest end across
        # all deployment groups; user-supplied kwargs take precedence.
        deploy_grps = self.deploy_dict['Deployment Groups']
        deploy_bdate = min([pd.to_datetime(deploy_grps[grp]['eval_start'])
                            for grp in deploy_grps.keys()])
        self.deploy_bdate = self.kwargs.get('deploy_bdate', deploy_bdate)
        deploy_edate = max([pd.to_datetime(deploy_grps[grp]['eval_end'])
                            for grp in deploy_grps.keys()])
        self.deploy_edate = self.kwargs.get('deploy_edate', deploy_edate)
        # Assign hourly/daily reference datasets (sets self.*_ref_df and
        # self.ref_name used below).
        self._assign_refdata_objs()
        # Compute normalized param values
        self.hourly_df_list = sensortoolkit.calculate.normalize(
            self.hourly_df_list,
            self.hourly_ref_df,
            param=self._param_name,
            ref_name=self.ref_name)
        # NOTE(review): daily sensor data are normalized against the 1-hour
        # reference dataset here (not self.daily_ref_df) -- confirm this is
        # intentional.
        self.daily_df_list = sensortoolkit.calculate.normalize(
            self.daily_df_list,
            self.hourly_ref_df,
            param=self._param_name,
            ref_name=self.ref_name)
        # Compute inter-sensor averaged parameter dataframes
        self.avg_hrly_df = sensortoolkit.calculate.intersensor_mean(
            self.hourly_df_list,
            self.deploy_dict)
        self.avg_daily_df = sensortoolkit.calculate.intersensor_mean(
            self.daily_df_list,
            self.deploy_dict)
        # Placeholders; populated by calculate_metrics().
        self.stats_df = pd.DataFrame()
        self.avg_stats_df = pd.DataFrame()
def _assign_refdata_objs(self):
# Retrieve reference data
self.ref_dict = self.reference.data
# Set reference dataframe based on evaluation parameter classification
self.hourly_ref_df = self.ref_dict[self.param.classifier]['1-hour']
hourly_ref_idx = self.hourly_ref_df.index
ref_param_cols = ['_Value', '_Unit', '_QAQC_Code', '_Param_Code',
'_Method', '_Method_Code', '_Method_POC']
site_cols = ['Agency', 'Site_Name', 'Site_AQS',
'Site_Lat', 'Site_Lon', 'Data_Source',
'Data_Acquisition_Date_Time']
# Unpack the ref data into dataframes. If no reference data found,
# return a dataframe backfilled with nulls.
if not self.ref_dict['PM']['1-hour'].empty:
self.pm_hourly_ref_df = self.ref_dict['PM']['1-hour']
self.pm_daily_ref_df = self.ref_dict['PM']['24-hour']
else:
cols = ['PM25' + col for col in ref_param_cols]
cols = cols + site_cols
self.pm_hourly_ref_df = pd.DataFrame(np.nan,
index=hourly_ref_idx,
columns=cols,
dtype=object)
# Replace null method names with 'Unspecified Reference'
for col_name in [col for col in cols if col.endswith('_Method')]:
self.pm_hourly_ref_df[col_name] = 'Unknown Reference'
self.pm_daily_ref_df = sensortoolkit.datetime_utils.interval_averaging(
self.pm_hourly_ref_df,
freq='D',
interval_count=24,
thres=0.75)
if not self.ref_dict['Gases']['1-hour'].empty:
self.gas_hourly_ref_df = self.ref_dict['Gases']['1-hour']
self.gas_daily_ref_df = self.ref_dict['Gases']['24-hour']
else:
cols = ['O3' + col for col in ref_param_cols]
cols = cols + site_cols
self.gas_hourly_ref_df = pd.DataFrame(np.nan,
index=hourly_ref_idx,
columns=cols,
dtype=object)
# Replace null method names with 'Unspecified Reference'
for col_name in [col for col in cols if col.endswith('_Method')]:
self.gas_hourly_ref_df[col_name] = 'Unknown Reference'
self.gas_daily_ref_df = sensortoolkit.datetime_utils.interval_averaging(
self.gas_hourly_ref_df,
freq='D',
interval_count=24,
thres=0.75)
if not self.ref_dict['Met']['1-hour'].empty:
self.met_hourly_ref_df = self.ref_dict['Met']['1-hour']
self.met_daily_ref_df = self.ref_dict['Met']['24-hour']
else:
cols = [met_param + col for col in ref_param_cols
for met_param in ['RH', 'Temp']]
cols = cols + site_cols
self.met_hourly_ref_df = pd.DataFrame(np.nan,
index=hourly_ref_idx,
columns=cols,
dtype=object)
# Replace null method names with 'Unspecified Reference'
for col_name in [col for col in cols if col.endswith('_Method')]:
self.met_hourly_ref_df[col_name] = 'Unknown Reference'
self.met_daily_ref_df = sensortoolkit.datetime_utils.interval_averaging(
self.met_hourly_ref_df,
freq='D',
interval_count=24,
thres=0.75)
# Get the name of the reference monitor
self.ref_name = self.reference.get_method_name(self.param.name)
self.daily_ref_df = self.ref_dict[self.param.classifier]['24-hour']
[docs] def add_deploy_dict_stats(self):
"""Populate deployment dictionary with statistical metrics.
Add precision and error performance targets metrics, include details
about reference (for selected evaluation parameter) and monitor
statistics for meteorological parameters (Temp, RH).
Calculates:
- CV for 1-hour averaged sensor datasets
- CV for 24-hour averaged sensor datasets
- RMSE for 1-hour averaged sensor datasets
- RMSE for 24-hour averaged sensor datasets
- Reference monitor concentration range, mean concentration during
testing period for 1-hour averaged measurements
- Reference monitor concentration range, mean concentration during
testing period for 24-hour averaged measurements
- Meteorological monitor measurement range, mean value for temperature
and/or relative humidity measurements at 1-hour intervals
- Meteorological monitor measurement range, mean value for temperature
and/or relative humidity measurements at 24-hour intervals
Populates:
- ``SensorEvaluation.deploy_dict``
Writes Files:
- Deployment dictionary
Returns:
None.
"""
# Compute inter-sensor precision and error metric values
# CV: 1-hour averaged sensor param
self.deploy_dict = sensortoolkit.calculate.cv(
self.hourly_df_list,
self.deploy_dict,
param=self._param_name)
# CV: 24-hour averaged sensor param
self.deploy_dict = sensortoolkit.calculate.cv(
self.daily_df_list,
self.deploy_dict,
param=self._param_name)
# RMSE: 1-hour averaged sensor param
self.deploy_dict = sensortoolkit.calculate.rmse(
self.hourly_df_list,
self.hourly_ref_df,
self.deploy_dict,
param=self._param_name)
# RMSE: 24-hour averaged sensor param
self.deploy_dict = sensortoolkit.calculate.rmse(
self.daily_df_list,
self.daily_ref_df,
self.deploy_dict,
param=self._param_name)
# Reference details for param evaluation (hourly data)
self.deploy_dict = sensortoolkit.deploy.deploy_ref_stats(
self.deploy_dict,
self.hourly_ref_df,
param=self._param_name,
ref_name=self.ref_name)
# Reference details for param evaluation (daily data)
self.deploy_dict = sensortoolkit.deploy.deploy_ref_stats(
self.deploy_dict,
self.daily_ref_df,
param=self._param_name,
ref_name=self.ref_name)
# Reference details for meteorological data (1-hr averages)
self.deploy_dict = sensortoolkit.deploy.deploy_met_stats(
self.deploy_dict,
self.hourly_df_list,
self.met_hourly_ref_df)
# Reference details for meteorological data (24-hr averages)
self.deploy_dict = sensortoolkit.deploy.deploy_met_stats(
self.deploy_dict,
self.daily_df_list,
self.met_daily_ref_df)
if self.write_to_file is True:
today = sensortoolkit.datetime_utils.get_todays_date()
# check if sensor-specific subfolder exists
if not os.path.exists(self.stats_path):
os.makedirs(self.stats_path)
with open(self.stats_path + self.name + '_' +
self._param_name + "_Evaluation_" + today +
".json", "w") as outfile:
deploy_json = json.dumps(self.deploy_dict, indent=4)
outfile.write(deploy_json)
[docs] def calculate_metrics(self):
"""Compute hourly, daily, and inter-sensor statistics dataframes.
.. note::
``calculate_metrics()`` will check whether
``SensorEvaluation.deploy_dict`` has been populated with statistics
via the ``add_deploy_dict_stats()`` method and will call this method
if the dictionary has not been populated yet.
Calculates:
- 1-hour averaged sensor vs. reference regression statistics for each
sensor
- 24-hour averaged sensor vs. reference regression statistics for each
sensor
- 1-hour averaged sensor vs. intersensor average regression statistics
for each sensor
- 24-hour averaged sensor vs. intersensor average regression statistics
for each sensor
Populates:
- ``SensorEvaluation.stats_df``
- ``SensorEvaluation.avg_stats_df``
Writes Files:
- Statistics DataFrame - Sensor vs. FRM/FEM
- Statistics DataFrame - Sensor vs. Intersensor Average
Returns:
None.
"""
try:
self.deploy_dict['Deployment Groups']['Group 1'][self._param_name]
except KeyError:
print('Populating deployment dataframe with evaluation statistics')
self.add_deploy_dict_stats()
hourly_stats = sensortoolkit.calculate.regression_stats(
sensor_df_obj=self.hourly_df_list,
ref_df_obj=self.hourly_ref_df,
deploy_dict=self.deploy_dict,
param=self._param_name,
serials=self.serials
)
daily_stats = sensortoolkit.calculate.regression_stats(
sensor_df_obj=self.daily_df_list,
ref_df_obj=self.daily_ref_df,
deploy_dict=self.deploy_dict,
param=self._param_name,
serials=self.serials
)
# Combine the statistics dataframes into one
self.stats_df = sensortoolkit.calculate.join_stats(
hourly_stats,
daily_stats,
stats_path=self.stats_path,
stats_type='individual',
write_to_file=self.write_to_file)
avg_hourly_stats = sensortoolkit.calculate.regression_stats(
sensor_df_obj=self.hourly_df_list,
ref_df_obj=self.hourly_ref_df,
deploy_dict=self.deploy_dict,
param=self._param_name,
serials=self.serials
)
avg_daily_stats = sensortoolkit.calculate.regression_stats(
sensor_df_obj=self.daily_df_list,
ref_df_obj=self.daily_ref_df,
deploy_dict=self.deploy_dict,
param=self._param_name,
serials=self.serials
)
# Combine the statistics dataframes into one
self.avg_stats_df = sensortoolkit.calculate.join_stats(
avg_hourly_stats,
avg_daily_stats,
stats_path=self.stats_path,
stats_type='average',
write_to_file=self.write_to_file)
[docs] def plot_timeseries(self, report_fmt=True, **kwargs):
"""Plot sensor and FRM/FEM reference measurements over time.
Sensor measurements are indicated by distinct colors in a discrete
color palette. FRM/FEM measurements are shown as black lines. The
x-axis indicates the date in 5-day increments (default, although
customizable). Measurement values are plotted along the y-axis.
Args:
report_fmt (bool, optional):
If true, format figure for inclusion in a performance report.
Defaults to True.
**kwargs (dict): Plotting keyword arguments.
Returns:
None.
"""
timestamp_fmt = '%Y-%m-%d %H:%M:%S'
t_start = (self.avg_hrly_df.dropna(how='all', axis=0).index[0] -
pd.Timedelta('1D')).strftime(timestamp_fmt)
t_end = (self.avg_hrly_df.dropna(how='all', axis=0).index[-1] +
pd.Timedelta('1D')).strftime(timestamp_fmt)
avg_list = self.param.averaging
param = kwargs.get('param', self._param_name)
kwargs.pop('param', None)
if len(avg_list) == 2 and report_fmt is True:
fig, axs = plt.subplots(2, 1, figsize=(10.15, 4.1))
fig.subplots_adjust(hspace=0.7)
for i, averaging_interval in enumerate(avg_list):
if averaging_interval == '1-hour':
sensor_data = self.hourly_df_list
if averaging_interval == '24-hour':
sensor_data = self.daily_df_list
ref_data = self.ref_dict[sensortoolkit.Parameter(param).classifier][averaging_interval]
ref_name = ref_data[f'{param}_Method'].unique()[0]
# Prevent Sensor_Timeplot from writing to file on first
# iteration of loop
if i == 0:
write_to_file = False
if i == len(avg_list) - 1:
write_to_file = self.write_to_file
axs[i] = sensortoolkit.plotting.sensor_timeplot(
sensor_data,
ref_data,
sensor_serials=self.serials,
param=param,
figure_path=self.figure_path,
sensor_name=self.name,
ref_name=ref_name,
bdate=t_start,
edate=t_end,
averaging_interval=averaging_interval,
report_fmt=report_fmt,
write_to_file=write_to_file,
ax=axs[i],
fig=fig,
**kwargs)
if i == 0:
axs[i].get_legend().remove()
else:
averaging_interval = kwargs.get('averaging_interval', '1-hour')
kwargs.pop('averaging_interval', None)
if '1-hour' in avg_list and averaging_interval == '1-hour':
sensor_data = self.hourly_df_list
if '24-hour' in avg_list and averaging_interval == '24-hour':
sensor_data = self.daily_df_list
ref_data = self.ref_dict[sensortoolkit.Parameter(param).classifier][averaging_interval]
ref_name = ref_data[f'{param}_Method'].unique()[0]
try:
sensor_data
except NameError as error:
sys.exit(error)
sensortoolkit.plotting.sensor_timeplot(
sensor_data,
ref_data,
sensor_serials=self.serials,
param=param,
figure_path=self.figure_path,
sensor_name=self.name,
ref_name=ref_name,
bdate=t_start,
edate=t_end,
averaging_interval=averaging_interval,
report_fmt=report_fmt,
write_to_file=self.write_to_file,
**kwargs)
[docs] def plot_metrics(self, **kwargs):
"""Regression dot/boxplots for U.S EPA performance metrics and targets
developed for PM2.5 and O3 sensor evaluations.
Results for the following metrics are shown:
- Linearity:
- :math:`R^2`: The coefficient of determination, which is a measure
of linearity between sensor
and reference measurement pairs.
- Bias:
- Slope: The slope of the ordinary least-squares regression between
sensor (y-axis) and
reference (x-axis) measurements.
- Intercept: The intercept term of the ordinary least-squares
regression between sensor (y-axis) and
reference (x-axis) measurements.
- Error:
- :math:`RMSE`: The root mean square error between sensor and
reference measurements.
- :math:`NRMSE`: The normalized root mean square error between sensor
and reference measurements, where RMSE has been normalized by the
mean reference concentration during the testing period.
- Precision:
- :math:`CV`: The coefficient of variation of concurrently recorded
sensor measurements.
- :math:`SD`: The standard deviation of concurrently recorded sensor
measurements.
Results are shown as either colored dots (if the number of sensors is
less than four) or as boxplots (if the number of sensors exceeds
three). Target ranges are indicated by gray shaded regions, and target
goals are indicated by dark gray lines. Results are grouped by data
averaging interval, including 1-hour and 24-hour intervals (note that
some pollutants such as O3 are analyzed only at 1-hour intervals due to
significant diurnal variability, so the formatting of the figure will
depend on which averaging interval(s) are indicated for the parameter
via the ``sensortoolkit.Parameter.averaging`` attribute).
Args:
**kwargs (dict): Plotting keyword arguments.
Returns:
None.
"""
try:
self.deploy_dict['Deployment Groups']['Group 1'][self._param_name]
except KeyError:
print('Populating deployment dataframe with evaluation statistics')
self.add_deploy_dict_stats()
if self.stats_df.empty:
print('Calculating OLS regression statistics for 1-hr and 24-hr '
'sensor vs. reference measurements')
self.calculate_metrics()
sensortoolkit.plotting.performance_metrics(
self.stats_df,
self.deploy_dict,
param=self._param_name,
param_averaging=self.param.averaging,
path=self.figure_path,
sensor_name=self.name,
write_to_file=self.write_to_file,
**kwargs)
    def plot_sensor_scatter(self, averaging_interval='24-hour',
                            plot_subset=None, **kwargs):
        """Plot sensor vs FRM/FEM reference measurement pairs as scatter.

        FRM/FEM reference concentrations are plotted along the x-axis, and
        sensor concentrations are plotted along the y-axis. Measurement
        pairs (i.e., concentration values for sensor and reference datasets
        recorded at matching timestamp entries) are colored by the relative
        humidity recorded by an independent meteorological instrument at
        the monitoring site if RH data are located within the
        ``reference_object.data['Met']`` DataFrame.

        Args:
            averaging_interval (str, optional):
                The measurement averaging interval commonly utilized for
                analyzing data corresponding to the selected parameter.
                Defaults to '24-hour'.
            plot_subset (list, optional):
                A list of either sensor serial IDs or the keys associated
                with the serial IDs in the serial dictionary. Defaults to
                None.

        **Keyword Arguments**

        :param dict report_fmt:
            For displaying scatter plots on the first page of the
            performance report included alongside U.S. EPA's documents
            outlining recommended testing protocols, performance metrics,
            and target values. Defaults to False.
        :param **kwargs:
            Additional keyword arguments passed to the underlying
            ``sensortoolkit.plotting.scatter_plotter()`` method.

        Returns:
            None.
        """
        report_fmt = kwargs.get('report_fmt', False)
        # Avoids multiple args passed to same param
        kwargs.pop('report_fmt', None)
        # Lazily populate deployment statistics if absent.
        try:
            self.deploy_dict['Deployment Groups']['Group 1'][self._param_name]
        except KeyError:
            print('Populating deployment dataframe with evaluation statistics')
            self.add_deploy_dict_stats()
        # Lazily compute regression statistics if absent.
        if self.stats_df.empty:
            print('Calculating OLS regression statistics for 1-hr and 24-hr '
                  'sensor vs. reference measurements')
            self.calculate_metrics()
        avg_list = self.param.averaging
        # Figuring out averaging intervals is done if report_fmt true, no
        # need to check for invalid intervals passed (will be ignored in
        # favor of intervals specified by Parameter.averaging)
        if not report_fmt and averaging_interval not in avg_list:
            txt = ('Invalid averaging interval, choose from the following: '
                   + ', '.join(avg_list))
            sys.exit(txt)
        if (report_fmt is True and plot_subset is not None):
            if len(avg_list) == 2:
                # Create a 1x2 subplot, 1-hr scatter on left and 24-hr
                # scatter on right for a single sensor unit (performance
                # report page 1 plot)
                figsize = (5.29, 3.17)
            elif len(avg_list) == 1:
                # Create a 1x1 subplot, 1-hr scatter with vertical colorbar
                figsize = (4.3, 3.91)
            else:
                sys.exit('Reporting template formatted '
                         'figure not specified for ' + self._param_name)
            fig, axs = plt.subplots(1, len(avg_list), figsize=figsize)
            fig.subplots_adjust(hspace=0.7)
            for i, averaging_interval in enumerate(self.param.averaging):
                # Select sensor, reference, and met datasets matching the
                # current averaging interval.
                if averaging_interval == '1-hour':
                    sensor_data = self.hourly_df_list
                    ref_data = self.hourly_ref_df
                    met_data = self.met_hourly_ref_df
                if averaging_interval == '24-hour':
                    sensor_data = self.daily_df_list
                    ref_data = self.daily_ref_df
                    met_data = self.met_daily_ref_df
                # Prevent sub-routine from writing to file on first
                # iteration of loop, also dont draw cbar on first loop.
                # NOTE: kwargs is mutated here ('draw_cbar') and passed
                # through to scatter_plotter on every iteration.
                if i == 0:
                    write_to_file = False
                    kwargs['draw_cbar'] = False
                if i == len(self.param.averaging) - 1:
                    write_to_file = self.write_to_file
                    kwargs['draw_cbar'] = True
                # plt.subplots returns an ndarray for multi-panel figures
                # and a single Axes object for a 1x1 figure; handle both.
                if isinstance(axs, np.ndarray):
                    ax = axs[i]
                    multiplot = True
                else:
                    ax = axs
                    multiplot = False
                ax = sensortoolkit.plotting.scatter_plotter(
                    sensor_data,
                    ref_data,
                    self.stats_df,
                    deploy_dict=self.deploy_dict,
                    met_ref_df=met_data,
                    sensor_serials=self.serials,
                    param=self._param_name,
                    figure_path=self.figure_path,
                    sensor_name=self.name,
                    ref_name=self.ref_name,
                    averaging_interval=averaging_interval,
                    plot_subset=plot_subset,
                    write_to_file=write_to_file,
                    report_fmt=True,
                    ax=ax,
                    fig=fig,
                    **kwargs)
                if multiplot:
                    axs[i] = ax
                else:
                    axs = ax
        # Create scatter for all sensors in an evaluation at a specified
        # averaging interval
        else:
            report_fmt = False
            # Assuming avg_list contains either only 1-hour or 24-hour
            if '1-hour' in avg_list and averaging_interval == '1-hour':
                sensor_data = self.hourly_df_list
                ref_data = self.hourly_ref_df
            if '24-hour' in avg_list and averaging_interval == '24-hour':
                sensor_data = self.daily_df_list
                ref_data = self.daily_ref_df
            # sensor_data is unbound if no interval matched; exit with the
            # NameError message in that case.
            try:
                sensor_data
            except NameError as error:
                sys.exit(error)
            sensortoolkit.plotting.scatter_plotter(
                sensor_data,
                ref_data,
                self.stats_df,
                deploy_dict=self.deploy_dict,
                met_ref_df=self.met_hourly_ref_df,
                sensor_serials=self.serials,
                param=self._param_name,
                figure_path=self.figure_path,
                sensor_name=self.name,
                ref_name=self.ref_name,
                averaging_interval=averaging_interval,
                plot_subset=plot_subset,
                report_fmt=report_fmt,
                write_to_file=self.write_to_file,
                **kwargs)
[docs] def plot_met_dist(self):
"""Plot the distribution of temperature and RH recorded by
meterological instruments at the collocation site.
Displays the relative frequency of meteorological measurements recorded
during the testing period. Temperature (left) and relative humidity
(right) measurements are displayed on separate subplots. Measurements
are grouped into 15 bins, and the frequency of measurements within bin
is normalized by the total number of measurements (i.e., the relative
frequency) is displayed as a histogram. Additionally, a polynomial
estimating the kernel density of measurements is shown for each subplot
and indicates the general distribution of measurements over the range
of recorded values.
This method will prioritize plotting meteorological measurements made
by reference instruments, as sensor measurements are commonly biased
warmer and drier than ambient conditions if measurements are made by
an onboard sensing component within the housing of the air sensor. If
no meteorological reference measurements are available, the method will
use sensor measurements; however, a disclaimer will displayed above
subplots indicating that sensor measurements are shown in the figure.
Returns:
None.
"""
met_params = ['Temp_Value', 'RH_Value']
sensortoolkit.plotting.met_distrib(self.met_hourly_ref_df[met_params],
self.avg_hrly_df,
figure_path=self.figure_path,
sensor_name=self.name,
write_to_file=self.write_to_file)
[docs] def plot_met_influence(self, met_param='Temp', report_fmt=True,
**kwargs):
"""Plot the influence meteorological parameters (temperature or
relative humidity) on sensor measurements.
Sensor measurements that have been normalized by reference measurement
values for the corresponding timestamp and are plotted along the
y-axis. Meteorological measurements as measured by temperature or
relative humidity monitors (rather than onboard sensor measurements)
are plotted along the x-axis. Scatter for each sensor are displayed as
separate colors to indicate the unique response of each sensor unit.
A gray 1:1 line indicates ideal agreement between sensor and reference
measurements over the range of meteorological conditions (i.e., a ratio
of 1 would indicate that the sensor and reference measure the same
concentration value for a given timestamp). Scatter below the 1:1
line indicates underestimation bias, and scatter above the 1:1 line
indicates overestimation bias.
Args:
met_param (str, optional):
Either ``'Temp'`` for displaying the influence of temperature
or ``'RH'`` for displaying the influence of relative humidity.
Defaults to None.
report_fmt (bool, optional):
If true, format figure for inclusion in a performance report.
Defaults to True.
**kwargs (dict): Plotting keyword arguments.
Returns:
None.
"""
# Reference data header names for met data
valid_met_params = ['Temp', 'RH']
if report_fmt is True:
fig, axs = plt.subplots(1, 2, figsize=(8.1, 3.8))
fig.subplots_adjust(hspace=0.7)
kwargs['fontsize'] = kwargs.get('fontsize', 10)
kwargs['ylims'] = kwargs.get('ylims', (-.3, 4))
for i, m_param in enumerate(valid_met_params):
# Prevent writing to file on first iteration of loop
if i == 0:
write_to_file = False
if i == 1:
write_to_file = self.write_to_file
axs[i] = sensortoolkit.plotting.normalized_met_scatter(
self.hourly_df_list,
self.hourly_ref_df,
self.avg_hrly_df,
self.met_hourly_ref_df,
self.figure_path,
param=self._param_name,
sensor_serials=self.serials,
sensor_name=self.name,
met_param=m_param,
ref_name=self.ref_name,
write_to_file=write_to_file,
report_fmt=report_fmt,
fig=fig,
ax=axs[i],
**kwargs)
if i == 0:
axs[i].get_legend().remove()
else:
# Either Temp or RH must be passed to met_param if not using report
# formatting. Report formatted plots dont require a value for
# met_param as both Temp and RH scatter are automatically plotted.
if met_param not in valid_met_params:
sys.exit(f'Invalid parameter name: {met_param}')
sensortoolkit.plotting.normalized_met_scatter(
self.hourly_df_list,
self.hourly_ref_df,
self.avg_hrly_df,
self.met_hourly_ref_df,
self.figure_path,
param=self._param_name,
sensor_serials=self.serials,
sensor_name=self.name,
met_param=met_param,
ref_name=self.ref_name,
write_to_file=self.write_to_file,
**kwargs)
[docs] def plot_sensor_met_scatter(self, averaging_interval='1-hour',
met_param='Temp',
**kwargs):
"""Plot internal sensor temp or RH measurements against collocated
reference monitor measurements.
Plots generated by this method:
* Internal sensor RH vs Reference monitor RH
* Internal sensor Temp vs Reference monitor Temp
Sensor measurements are plotted along the y-axis with reference
measurements along the x-axis. Statistical quantities are displayed
for each scatter plot including the ordinary least-squares (OLS)
regression equation, R^2, RMSE, and N (the number of measurement
pairs). The one-to-one line (indicating ideal agreement between
sensor and reference measurements) is shown as a dashed gray line.
Args:
averaging_interval (str, optional):
The measurement averaging intervals commonly utilized for
analyzing data corresponding the the selected parameter.
Defaults to '1-hour'.
met_param (str, optional):
The meteorological parameter to display. Defaults to None.
**kwargs (dict):
Plotting keyword arguments.
Returns:
None.
"""
# Data header names for met data
met_params = ['Temp', 'RH']
if met_param not in met_params:
sys.exit('Invalid parameter name: ' + str(met_param))
if averaging_interval not in self.param.averaging:
txt = ('Invalid averaging interval, choose from the following: '
+ ', '.join(self.param.averaging))
sys.exit(txt)
if averaging_interval == '1-hour':
sensor_data = self.hourly_df_list
ref_data = self.met_hourly_ref_df
if averaging_interval == '24-hour':
sensor_data = self.daily_df_list
ref_data = self.met_daily_ref_df
ref_name = ref_data[met_param + '_Method'].unique()[0]
ymin = math.floor(self.avg_hrly_df[
'mean_' + met_param + '_Value'].min())
ymax = round(self.avg_hrly_df[
'mean_' + met_param + '_Value'].max(), -1)
xmin, xmax = ymin, ymax
try:
self.deploy_dict['Deployment Groups']['Group 1'][self._param_name]
except KeyError:
print('Populating deployment dataframe with evaluation statistics')
self.add_deploy_dict_stats()
try:
self.stats_df
except AttributeError:
print('Calculating OLS regression statistics for 1-hr and 24-hr '
'sensor vs. reference measurements')
self.calculate_metrics()
fontsize = sensortoolkit.plotting.set_fontsize(self.serials)
# Set keyword argument values to defaults or passed values
kwargs['fontsize'] = kwargs.get('fontsize', fontsize)
kwargs['ylims'] = kwargs.get('ylims', (ymin, ymax))
kwargs['xlims'] = kwargs.get('xlims', (xmin, xmax))
kwargs['param_class'] = 'Met'
kwargs['tick_spacing'] = kwargs.get('tick_spacing', 10)
kwargs['show_colorbar'] = False
sensortoolkit.plotting.scatter_plotter(
sensor_data,
ref_data,
deploy_dict=self.deploy_dict,
param=met_param,
sensor_name=self.name,
ref_name=ref_name,
averaging_interval=averaging_interval,
figure_path=self.figure_path,
write_to_file=self.write_to_file,
sensor_serials=self.serials,
**kwargs)
[docs] def print_eval_metrics(self, averaging_interval='24-hour'):
"""Display a summary of performance evaluation results using
EPA’s recommended performance metrics (‘PM25’ and ‘O3’).
The coefficient of variation, sensor vs FRM/FEM OLS regression slope,
intercept, and R2, and RMSE are displayed. Regression statistics
are computed for each sensor, and the mean metric value is
presented alongside the range (min to max).
Args:
averaging_interval (dict, optional):
The measurement averaging intervals commonly utilized for
analyzing data corresponding the the selected parameter.
Defaults to '24-hour'.
Returns:
None.
"""
try:
self.deploy_dict['Deployment Groups']['Group 1'][self._param_name]
except KeyError:
print('Populating deployment dataframe with evaluation statistics')
self.add_deploy_dict_stats()
if self.stats_df.empty:
self.calculate_metrics()
param = self._param_name
deploy_dic = self.deploy_dict
deploy_stats = self.stats_df.where(
self.stats_df['Averaging Interval'] == averaging_interval)
print(88*'-')
print('{:^88s}'.format(self.name + ' '
+ averaging_interval +
' Performance Evaluation Results'))
print('{:^88s}'.format('Reference Method: ' + self.ref_name))
print(88*'-')
print('{:^6s}|{:^24s}|{:^24s}|{:^24s}|{:^6s}'.format('CV', 'Slope',
'Intercept', 'R^2', 'RMSE'))
print(88*'-')
cv_data = [(deploy_dic['Deployment Groups'][group]
[param]['Precision']['cv_' + averaging_interval])
for group in deploy_dic['Deployment Groups']]
slope_avg = deploy_stats.Slope.mean()
slope_min = deploy_stats.Slope.min()
slope_max = deploy_stats.Slope.max()
intercept_avg = deploy_stats.Intercept.mean()
intercept_min = deploy_stats.Intercept.min()
intercept__max = deploy_stats.Intercept.max()
linearity_avg = deploy_stats['R$^2$'].mean()
linearity_min = deploy_stats['R$^2$'].min()
linearity_max = deploy_stats['R$^2$'].max()
rmse_data = [(deploy_dic['Deployment Groups'][group]
[param]['Error']['rmse_' + averaging_interval])
for group in deploy_dic['Deployment Groups']]
print(('{:^6.1f}|{:^24.2f}|'
'{:^24.2f}|{:^24.2f}|{:^6.1f}').format(cv_data[0],
slope_avg,
intercept_avg,
linearity_avg,
rmse_data[0]))
print(5*' ',
('| ({:4.2f} to {:4.2f}) '
'| ({:4.2f} to {:4.2f}) '
'| ({:4.2f} to {:4.2f}) |').format(slope_min,
slope_max,
intercept_min,
intercept__max,
linearity_min,
linearity_max),
5*' ')
[docs] def print_eval_conditions(self, averaging_interval='24-hour'):
"""Display conditions for the evaluation parameter and meteorological
conditions during the testing period.
Values for the evaluation parameter recorded by the sensor, FRM/FEM
instrument, and temperature and relative humidity values are
displayed by the mean of 1-hour or 24-hour averages during the
testing period. The range (min to max) of each parameter is listed
below the mean in parentheses.
Args:
averaging_interval (str, optional):
The measurement averaging intervals commonly utilized for
analyzing data corresponding the the selected parameter.
Defaults to '24-hour'.
Returns:
None.
"""
try:
self.deploy_dict['Deployment Groups']['Group 1'][self._param_name]
except KeyError:
print('Populating deployment dataframe with evaluation statistics')
self.add_deploy_dict_stats()
if self.stats_df.empty:
self.calculate_metrics()
if averaging_interval == '1-hour':
ref_df = self.hourly_ref_df
met_ref_df = self.met_hourly_ref_df
if averaging_interval == '24-hour':
ref_df = self.daily_ref_df
met_ref_df = self.met_daily_ref_df
deploy_dict = self.deploy_dict
deploy_stats = self.stats_df.where(
self.stats_df['Averaging Interval'] == averaging_interval
).dropna(how='all', axis=0)
n_sensors = len(self.serials)
print(88*'-')
print('{:^88s}'.format(self.name + ' (' + str(n_sensors) + ') '
+ averaging_interval +
' Evaluation Conditions'))
print(88*'-')
print('{:^14s}|{:^14s}|{:^14s}|{:^14s}|{:^14s}|{:^14s}'.format(
'Eval period', 'Duration', 'Sensor ' + self._param_name,
'Ref ' + self._param_name, 'Temp', 'RH'))
print(88*'-')
deploy_loc = deploy_dict['Deployment Groups']
eval_start = [pd.to_datetime(deploy_loc[group]['eval_start']
).strftime('%m-%d-%y')
for group in deploy_loc]
eval_end = [pd.to_datetime(deploy_loc[group]['eval_end']
).strftime('%m-%d-%y')
for group in deploy_loc]
eval_duration = [str(pd.to_timedelta(
deploy_loc[group]['eval_duration']
).round('D').days) + ' days'
for group in deploy_dict['Deployment Groups']]
sensor_min = format(deploy_stats.Sensor_Min.min(), '3.1f')
sensor_max = format(deploy_stats.Sensor_Max.max(), '3.1f')
sensor_mean = format(deploy_stats.Sensor_Mean.mean(), '3.1f')
ref_min = format(ref_df[self._param_name + '_Value'].min(), '3.1f')
ref_max = format(ref_df[self._param_name + '_Value'].max(), '3.1f')
ref_mean = format(ref_df[self._param_name + '_Value'].mean(), '3.1f')
temp_min = format(met_ref_df['Temp_Value'].min(), '2.0f')
temp_max = format(met_ref_df['Temp_Value'].max(), '2.0f')
temp_mean = format(met_ref_df['Temp_Value'].mean(), '2.0f')
rh_min = format(met_ref_df['RH_Value'].min(), '2.0f')
rh_max = format(met_ref_df['RH_Value'].max(), '2.0f')
rh_mean = format(met_ref_df['RH_Value'].mean(), '2.0f')
print(('{:^14s}|{:^14s}|{:^14s}|{:^14s}|{:^14s}|{:^14s}'
).format(eval_start[0]+'-',
eval_duration[0],
sensor_mean,
ref_mean,
temp_mean,
rh_mean))
print(('{:^14s}|{:^14s}|{:^14s}|{:^14s}|{:^14s}|{:^14s}'
).format(eval_end[0],
'',
'(' + sensor_min + ' to ' + sensor_max + ')',
'(' + ref_min + ' to ' + ref_max + ')',
'(' + temp_min + ' to ' + temp_max + ')',
'(' + rh_min + ' to ' + rh_max + ')'))