# -*- coding: utf-8 -*-
"""
Interactive module for specifying the setup configuration for both sensor
and reference data.
Users are asked to supply various details about their
dataset(s) for the purpose of data ingestion into the Sensor Data
Formatting Scheme (SDFS).
===============================================================================
@Author:
| Samuel Frederick, NSSC Contractor (ORAU)
| U.S. EPA / ORD / CEMM / AMCD / SFSB
Created:
Mon Jul 19 08:25:55 2021
Last Updated:
Mon Jul 19 08:25:55 2021
"""
import os
import sys
from textwrap import wrap
import json
import pandas as pd
from pandas.errors import EmptyDataError
import pprint
import charset_normalizer
import pytz
from pytz.exceptions import UnknownTimeZoneError
from sensortoolkit.lib_utils import (flatten_list, validate_entry,
enter_continue, copy_datasets)
from sensortoolkit.param import Parameter
from sensortoolkit.reference import preprocess_airnowtech
from sensortoolkit.ingest import standard_ingest
from sensortoolkit.datetime_utils import (interval_averaging,
get_timestamp_interval)
from sensortoolkit import _param_dict
from sensortoolkit.lib_utils._copy_datasets import _prompt_files, _check_extension
class _Setup:
"""Setup methods for Sensor and Reference data ingestion configuration.
Args:
path (str, optional):
The path to the directory where the user intends to store data,
figures, and reports relating to the sensor being tested.
Defaults to None.
"""
#params = sorted(list(Parameter.__param_dict__.keys()))
sdfs_params = [key for key in _param_dict if not _param_dict[key]['custom']]
custom_params = [key for key in _param_dict if _param_dict[key]['custom']]
data_types = ['.csv', '.txt', '.xlsx']
__banner_w__ = 79
pp = pprint.PrettyPrinter()
def __init__(self, path=None):
if path is None:
raise AttributeError('Path for working directory not specified')
self.path = path
self.data_rel_path = None
self.data_type = None
self.file_extension = None
self.header_names = None
self.header_iloc = None
self.data_row_idx = None
self.custom_ingest = False
self.use_previous_setup = False
self.sdfs_header_names = []
self.all_col_headers = []
self.timestamp_col_headers = []
self.col_headers = {}
def config(self):
"""Wrapper method for standard configuration setup.
Utilized by both sensor and reference setup schemes.
Returns:
None.
"""
# Indicate the dataset file type (.csv, .txt, .xlsx)
self.setDataExtension()
# Ask user for either directory or files to load in, parse datasets
# and copy datasets to transfer to 'data' subdirectory
self.setDataRelPath()
self.selectDataSets()
self.copyDataSets()
# Ask if using custom ingest module, if true, exit config
self.specifyCustomIngest()
if self.custom_ingest:
return
# Ask user if they have a previously configured setup.json for the device
self.loadPreviousSetup()
if self.use_previous_setup:
return
# set the row position where the data header is located
self.setHeaderIndex()
if self.header_iloc is None:
# Manually specify column names if none provided
self.setColumnHeaders()
# Otherwise, infer the column header names in parseDataSets() at the
# indicated iloc position
self.parseDataSets()
# Specify which headers are associated with timestamp info
self.setTimeHeaders()
# Specify how to convert recorded parameter headers to SDFS
self.setParamHeaders()
# Specify datetime formatting for time-like columns and tzone
self.setDateTimeFormat()
self.setTimeZone()
def printSelectionBanner(self, select_type, options=[], notes=[]):
"""Display a banner indicating the current configuration step.
Args:
select_type (str): The title of the configuration section.
options (list, optional):
List of interactive options indicating keyword characters
used to modify the state of the console and
a description of what entering that keyword does. Defaults
to [].
Example:
>>> options = ['..press X to end adding entries',
'..press D to delete the previous entry']
notes (list, optional):
A list of strings containing notes or
resources that may provide helpful context for the selection
input or operation. Defaults to [].
Returns:
None.
"""
self.end_str = '..press X to end adding entries'
self.del_str = '..press D to delete the previous entry'
self.skip_str = '..press enter to skip columns that will be dropped'
banner_c = int(self.__banner_w__ / 2)
select_len = len(select_type)
select_start = banner_c - int(select_len / 2) - 1
n_left, n_right = select_start, select_start
if select_len % 2 != 0:
n_right -= 1
flier = (n_left*'=' + ' ' + select_type + ' ' + n_right*'=')
print(flier)
if options != []:
print('Options\n-------')
options = ['\n'.join(wrap(str(l),
width=self.__banner_w__)) for l in options]
for line in options:
print(line)
if notes != []:
if options != []:
print('')
print('Notes\n-----')
notes = ['\n'.join(wrap(str(l),
width=self.__banner_w__)) for l in notes]
for line in notes:
print(line)
print(len(flier)*'=')
def add_param_attrib(self, param, attrib_key, attrib_val):
"""Assign parameter header attribute.
Search through the column index entries; if the parameter name is within
the column index subdictionary, add the passed attribute key and value.
Args:
param (str):
The name of the parameter.
attrib_key (str):
The key to assign to the subdictionary entry.
attrib_val (int, float, or str):
The value to assign to the subdictionary entry.
Returns:
None.
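Example:
A minimal sketch (hypothetical column header ``'PM25_ug_m3'``, assumed
to already exist in ``col_headers``), marking the header to be kept:
>>> self.add_param_attrib('PM25_ug_m3',
...                       attrib_key='drop',
...                       attrib_val=False)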
"""
for col_idx in self.col_headers.keys():
if param in self.col_headers[col_idx]:
self.col_headers[col_idx][param][attrib_key] = attrib_val
def setDataExtension(self):
"""Select the file data extension for to the datasets that will be
loaded.
Choose the corresponding data file type for recorded
datasets from ``'.csv'``, ``'.txt'``, ``'.xlsx'``.
Returns:
None.
"""
self.printSelectionBanner('Select Data Type',
options=[self.data_types])
valid = False
while valid is False:
console_text = (f'Enter the {self.data_type} data type from the '
'list of supported data types: ')
console_text = '\n'.join(wrap(console_text,
width=self.__banner_w__))
val = input(console_text)
if str(val) not in self.data_types:
print('..invalid entry, please enter one of the listed data '
'types')
else:
self.file_extension = val
print('')
print('Selected data type:', self.file_extension)
confirm = validate_entry()
if confirm == 'y':
valid = True
print('')
def selectDataSets(self):
"""Choose the selection scheme for pointing to recorded data files.
Selection options include the following:
- ``'directory'``, which will locate and copy all of the data files
in the specified directory for the indicated data type
- ``'recursive directory'``, which will locate and copy all data
files within the specified directory and any subdirectories
contained within the indicated folder path
- ``'files'`` which copies over files that the user manually selects
within a directory.
Returns:
None.
"""
select_options = ['directory', 'recursive directory', 'files']
self.printSelectionBanner('Select Data Files or Directory',
options=[select_options])
valid = False
while valid is False:
console_text = (f'Enter how to select {self.data_type} datasets '
f'from the list of options above: ')
console_text = '\n'.join(wrap(console_text,
width=self.__banner_w__))
val = input(console_text)
if str(val) not in select_options:
print('..invalid entry, please enter one of the options '
'listed above')
else:
self._dataset_selection = val
print('')
print('Select data sets by', self._dataset_selection)
confirm = validate_entry()
if confirm == 'y':
valid = True
print('')
def copyDataSets(self):
"""Copy recorded datasets from the selected file or folder
location to the ``../data/sensor_data/[sensor_name]/raw_data``
directory path.
Returns:
None.
"""
self.printSelectionBanner('Copy Data Files to the Project Directory',
options=[])
print('')
self.file_list = copy_datasets(data_type=self.data_type,
path=self.path,
select=self._dataset_selection,
file_extension=self.file_extension,
return_filenames=True,
**self.dataset_kwargs)
enter_continue()
def specifyCustomIngest(self):
"""Ask the user whether a custom, prewritten ingestion module will be
used to import sensor data instead of the standard_ingest() method.
Returns:
None.
"""
self.printSelectionBanner('Indicate whether to use a custom ingestion method',
options=[])
confirm = validate_entry(
statement='Will a custom, prewritten ingestion module be used to import data?')
if confirm == 'y':
self.custom_ingest = True
# TODO: could make note of how to use AirSensor.load_data with custom ingest module here
enter_continue()
def loadPreviousSetup(self):
"""Ask the user if a previous setup config exists for the type of sensor
or reference dataset that they are loading. If they choose to use a
previously configured setup.json file, use the file attributes to fill
in various setup components, such as the parameter renaming, datetime
formatting, etc.
Returns:
None.
"""
self.printSelectionBanner('Indicate whether to use a previous setup configuration',
options=[])
console_text = ('Have you previously created a setup.json config '
'file that [1] matches the device type associated with '
'the selected data sets and that [2] you intend to use '
'to configure the current setup session?')
console_text = '\n'.join(wrap(console_text,
width=self.__banner_w__))
confirm = validate_entry(statement=console_text)
if confirm == 'y':
self.use_previous_setup = True
# Ask user to locate where the setup.json file is
valid_file = False
while valid_file is False:
# TODO: print something to console prompting user to select file
print('')
print('Select the setup.json file you wish to use')
file_path = _prompt_files(single_file=True)
valid_file = _check_extension(file_path, '.json')
enter_continue()
# import that json
with open(file_path) as p:
previous_setup_data = json.loads(p.read())
p.close()
# extract attributes
self.header_iloc = previous_setup_data['header_iloc']
self.data_row_idx = previous_setup_data['data_row_idx']
self.sdfs_header_names = previous_setup_data['sdfs_header_names']
self.parseDataSets()
previous_col_headers = previous_setup_data['col_headers']
col_descrip = {}
for col_idx in previous_col_headers:
for label in previous_col_headers[col_idx]:
if label not in col_descrip:
col_descrip[label] = {}
col_descrip[label]['sdfs_param'] = previous_col_headers[col_idx][label]['sdfs_param']
col_descrip[label]['header_class'] = previous_col_headers[col_idx][label]['header_class']
if col_descrip[label]['header_class'] == 'datetime':
col_descrip[label]['dt_format'] = previous_col_headers[col_idx][label]['dt_format']
col_descrip[label]['dt_timezone'] = previous_col_headers[col_idx][label]['dt_timezone']
col_descrip[label]['drop'] = previous_col_headers[col_idx][label]['drop']
if (col_descrip[label]['header_class'] == 'parameter') and (col_descrip[label]['drop'] is False):
col_descrip[label]['unit_transform'] = previous_col_headers[col_idx][label]['unit_transform']
# TODO: Implement some sort of error catching mechanism that kicks in
# if the formatting for the current datasets and previous setup config
# do not match, fall back with standard setup process (i.e.,
# set use_previous_setup to false and continue)
self._not_in_previous_setup = {}
for col_idx in self.col_headers.copy():
for label in self.col_headers[col_idx].copy():
# Check if the recorded dataset doesn't have a header;
# if so, use previously manually configured names
if previous_setup_data['header_iloc'] is None:
# label is an integer value, reassign to previously
# manually configured name
former_label = label
label = list(col_descrip.keys())[former_label]
self.col_headers[col_idx][label] = self.col_headers[col_idx].pop(former_label)
try:
self.col_headers[col_idx][label]['sdfs_param'] = col_descrip[label]['sdfs_param']
self.col_headers[col_idx][label]['header_class'] = col_descrip[label]['header_class']
if col_descrip[label]['header_class'] == 'datetime':
self.col_headers[col_idx][label]['dt_format'] = col_descrip[label]['dt_format']
self.col_headers[col_idx][label]['dt_timezone'] = col_descrip[label]['dt_timezone']
self.col_headers[col_idx][label]['drop'] = col_descrip[label]['drop']
if (col_descrip[label]['header_class'] == 'parameter') and (col_descrip[label]['drop'] is False):
self.col_headers[col_idx][label]['unit_transform'] = col_descrip[label]['unit_transform']
except KeyError as e:
# header in the current datasets but not in previous setup
# For now, just assume the columns are not going to be used
if label in self.all_col_headers:
self.col_headers[col_idx][label]['sdfs_param'] = ''
self.col_headers[col_idx][label]['header_class'] = 'parameter'
self.col_headers[col_idx][label]['drop'] = True
if col_idx not in self._not_in_previous_setup:
self._not_in_previous_setup[col_idx] = {}
self._not_in_previous_setup[col_idx][label] = {'sdfs_param': '',
'header_class': '',
'drop': False}
continue
# Ask the user to specify attributes for columns that
# didn't appear in the previously configured setup.
if self._not_in_previous_setup != {}:
new_cols = []
for col_idx in self._not_in_previous_setup:
new_cols.extend(self._not_in_previous_setup[col_idx].keys())
new_cols = list(set(new_cols))
self.all_col_headers.extend(new_cols)
self.all_col_headers = list(set(self.all_col_headers))
self.setTimeHeaders(
print_statement=f'\nFrom the following list of column names, enter the names of columns which contain timestamps\n{new_cols}')
self.setDateTimeFormat()
self.setTimeZone()
self.setParamHeaders(col_list=new_cols)
def loadDataFile(self, file, **kwargs):
"""Helper function for loading the first few rows of recorded datasets.
Args:
file (str):
Full path to dataset file.
**Keyword Arguments:**
:param int nrows:
The number of rows to load for the passed dataset. Defaults to 1.
:param bool load_table:
If True, load the file via ``pandas.read_table()`` with no header row
(used for previewing unformatted rows). Defaults to False.
:param str encoding:
The file encoding passed to the pandas reader. Defaults to None.
Raises:
TypeError: If data type is not in the list of valid extensions.
Returns:
df (pandas DataFrame):
A DataFrame containing the first few rows of recorded datasets.
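Example:
A minimal sketch (hypothetical file path; assumes ``file_extension``
has been set to ``'.csv'``):
>>> df = self.loadDataFile('/path/to/raw_data/example_dataset.csv',
...                        nrows=5)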
"""
load_table = kwargs.get('load_table', False)
encoding = kwargs.get('encoding', None)
if load_table:
df = pd.read_table(file,
nrows=kwargs.get('nrows', 1),
header=None,
encoding=encoding)
elif self.file_extension == '.csv' or self.file_extension == '.txt':
df = pd.read_csv(file, header=self.header_iloc,
names=self.header_names,
nrows=kwargs.get('nrows', 1),
skiprows=self.data_row_idx,
on_bad_lines='skip',
encoding=encoding
)
elif self.file_extension == '.xlsx':
df = pd.read_excel(file, header=self.header_iloc,
names=self.header_names,
nrows=kwargs.get('nrows', 1),
skiprows=self.data_row_idx,
encoding=encoding
)
else:
raise TypeError('Invalid data type')
return df
def setDataRelPath(self):
"""Assign the relative path for the recorded dataset subdirectory.
The relative path stems from the project path.
For sensor data, the relative path to raw (recorded datasets)
should appear something like:
``/data/sensor_data/[sensor_name]/raw_data`` where 'sensor_name' is the
name given to the air sensor.
For reference datasets, the relative path to raw (recorded datasets)
should appear something like:
``/data/reference_data/[reference_data_source]/raw/[sitename_siteid]``
where 'reference_data_source' is the source or api service from which
data were acquired, 'sitename' is the name given to the site, and
'siteid' is the AQS id for the site (if applicable).
Returns:
None.
"""
self.data_rel_path = f'/data/{self.data_type}_data/'
if self.data_type == 'sensor':
self.data_rel_path += f'{self.name}/raw_data'
if self.data_type == 'reference':
self.data_rel_path += f'{self.dataset_kwargs["ref_data_source"]}/raw/{self.ref_data_subfolder}/'
def parseDataSets(self, print_banner=True):
"""Load the first few rows of recorded sensor datasets located in the
``../data/sensor_data/[sensor_name]/raw_data`` directory path.
The names of column headers are located based on the
indicated head index. A list of unique column headers is stored for
subsequent reassignment of column header names.
Args:
print_banner (bool, optional):
If ``'True'``, a banner indicating the title of the section,
user input options, and notes will be printed to the console.
Defaults to True.
Returns:
None.
"""
if print_banner:
self.printSelectionBanner('Parsing Datasets',
options=[])
print('')
# Load data files and populate a dictionary of unique headers that
# occur. Top level is ordered by the row index, so if some files have
# different headers, there will be multiple entries within the
# particular row index key.
print(f'Parsing datasets at "..{self.data_rel_path}"')
print('')
self.encoding_predictions = {}
for i, file in enumerate(self.file_list):
# Try loading with utf-8 encoding, if error raised, predict encoding
try:
df = self.loadDataFile(file)
except UnicodeDecodeError:
print('[WARNING]: Reading the following dataset with utf-8 encoding '
'unsuccessful')
print(file.replace(self.data_rel_path, ''))
print('..Attempting to guess encoding')
with open(file, 'rb') as f:
data = f.read(10000)
prediction = charset_normalizer.detect(data)
print('..encoding prediction:')
print(f'....{prediction}')
print('')
try:
df = self.loadDataFile(file, encoding=prediction['encoding'])
self.encoding_predictions[str(i)] = prediction['encoding']
except UnicodeError as e:
print('Error encountered in file:', file)
print(e)
print(f'Encoding prediction {prediction["encoding"]} unsuccessful for {file}')
#self.encoding_predictions[str(i)] = prediction['encoding']
except UnicodeDecodeError as e:
print('Error encountered in file:', file)
print(e)
print(f'Encoding prediction {prediction["encoding"]} unsuccessful for {file}')
except EmptyDataError as e:
print(f'[Warning] {e}:')
print(f' {file}')
print('')
file_col_list = list(df.columns)
for j, col in enumerate(file_col_list):
if 'col_idx_' + str(j) not in self.col_headers:
self.col_headers['col_idx_' + str(j)] = {}
if col not in self.col_headers['col_idx_' + str(j)]:
self.col_headers['col_idx_' + str(j)][col] = {"sdfs_param": None,
"in_file_list_idx": [i]}
else:
self.col_headers['col_idx_' + str(j)][col]["in_file_list_idx"].append(i)
# Create a nested list of unique column names
col_list = [list(self.col_headers[key].keys()) for key in
list(self.col_headers.keys())]
self.all_col_headers = flatten_list(col_list)
for i, cols in enumerate(col_list):
print('..Header(s) at column index {0:d}: {1}'.format(i, cols))
enter_continue()
def setHeaderIndex(self, print_banner=True):
"""Select the integer index position for the row containing headers.
Args:
print_banner (bool, optional):
If ``'True'``, a banner indicating the title of the section,
user input options, and notes will be printed to the console.
Defaults to True.
Returns:
None.
"""
if print_banner:
self.printSelectionBanner('Column Header Index',
options=['..type "None" if no header '
'columns in recorded sensor '
'dataset'])
# Load the first dataset (display 10 rows to user)
if self.file_list == []:
data_path = os.path.normpath(os.path.join(self.path,
self.data_rel_path))
sys.exit('No data files found with type'
' {0} at {1}'.format(self.file_extension, data_path))
# First try loading with utf-8 encoding; if an error is raised, predict the encoding
file = self.file_list[0]
try:
df = self.loadDataFile(file,
nrows=10,
load_table=True)
except UnicodeDecodeError:
print('[WARNING]: Reading the following dataset with utf-8 encoding '
'unsuccessful')
print(file.replace(self.data_rel_path, ''))
print('..Attempting to guess encoding')
with open(file, 'rb') as f:
data = f.read(10000)
prediction = charset_normalizer.detect(data)
print('..encoding prediction:')
print(f'....{prediction}')
try:
df = self.loadDataFile(file,
nrows=10,
load_table=True,
encoding=prediction['encoding'])
except UnicodeError as e:
print('')
print('Error encountered in file:', file)
print(e)
print(f'Encoding prediction {prediction["encoding"]} unsuccessful for {file}')
#self.encoding_predictions[str(i)] = prediction['encoding']
except UnicodeDecodeError as e:
print('')
print('Error encountered in file:', file)
print(e)
print(f'Encoding prediction {prediction["encoding"]} unsuccessful for {file}')
except EmptyDataError as e:
print(f'[Warning] {e}:')
print(f' {file}')
print('')
filename = file.split('/')[-1]
print('')
print('The first ten unformatted rows of {0} are displayed'
' below:'.format(filename))
print(df.head(n=10))
valid = False
while valid is False:
self.header_iloc = input('Enter the row index number for column '
'headers: ')
try:
self.header_iloc = int(self.header_iloc)
except ValueError:
self.header_iloc = self.header_iloc
if (self.header_iloc != 'None' and
type(self.header_iloc) is not int):
print('..invalid entry, enter either an integer or "None"')
elif self.header_iloc == 'None':
self.header_iloc = None
valid = True
elif type(self.header_iloc) is int:
if self.header_iloc < 0:
print('..invalid entry, enter either an integer or "None"')
else:
valid = True
print('')
print('Header row index:', str(self.header_iloc))
confirm = validate_entry()
if confirm == 'n':
self.setHeaderIndex(print_banner=False)
print('')
def setColumnHeaders(self, print_banner=True):
"""Manually set column headers if the user indicates ``'None'`` for the
row index for the column headers in ``setHeaderIndex()``.
Args:
print_banner (bool, optional):
If ``'True'``, a banner indicating the title of the section,
user input options, and notes will be printed to the console.
Defaults to True.
Raises:
ValueError: Raised if the value of the entered index is invalid
(less than zero).
Returns:
None.
"""
if print_banner:
self.printSelectionBanner('Manually Set Column Headers',
options=[self.end_str])
self.header_names = []
edit = True
col_n = 1
while edit:
confirm = 'n'
while confirm == 'n':
col_name = input("Enter Column Header #{0}: ".format(str(col_n)))
if col_name == 'X':
edit = False
break
# Shortcut method for copying and pasting list of columns into
# first entry
replace_strs = ['\n', ' ', '"']
for char in replace_strs:
col_name = col_name.replace(char, '')
col_list = col_name.split(',')
if len(col_list) > 1:
# Assign only if list of strings passed
if '[' in col_list[0] and ']' in col_list[-1]:
col_list[0] = col_list[0].replace('[', '')
col_list[-1] = col_list[-1].replace(']', '')
print('..assigning column names based on passed list')
self.header_names = col_list
edit = False
break
confirm = validate_entry()
if edit is False:
break
else:
self.header_names.append(col_name)
col_n += 1
print('')
print('Column Headers:', self.header_names)
enter_continue()
confirm = 'n'
while confirm == 'n':
self.data_row_idx = input("Enter the row index that data begin "
"on: ")
try:
self.data_row_idx = int(self.data_row_idx)
if self.data_row_idx < 0:
raise ValueError
confirm = validate_entry()
except ValueError:
print('..invalid entry, enter an integer >= 0')
print('')
def setTimeHeaders(self, print_banner=True, print_statement=None):
"""Specify the column(s) containing date/timestamp information.
Args:
print_banner (bool, optional):
If ``'True'``, a banner indicating the title of the section,
user input options, and notes will be printed to the console.
Defaults to True.
Returns:
None.
"""
if print_banner:
self.printSelectionBanner('Specify Timestamp columns',
options=[self.end_str, self.del_str])
# Create a list of time-like columns, update the col_headers list with
# the DateTime type corresponding to the specified header name
# Enter in the time like columns [LOOP]
if print_statement is not None:
print(print_statement)
end = False
i = 1
while end is False:
val = input("Enter Timestamp column name #{0}: ".format(str(i)))
if val == 'X':
end = True
elif val == 'D':
try:
self.timestamp_col_headers.pop(i-2)
print('..removing timestamp column #{0} from '
'list'.format(str(i-1)))
i -= 2
print('..updated timestamp column headers list: ')
print(' ', self.timestamp_col_headers)
except IndexError:
print('Empty list, no entries to delete')
continue
elif val in self.all_col_headers:
self.timestamp_col_headers.append(val)
self.add_param_attrib(val,
attrib_key='header_class',
attrib_val='datetime')
self.add_param_attrib(val,
attrib_key='sdfs_param',
attrib_val='DateTime')
self.add_param_attrib(val,
attrib_key='drop',
attrib_val=False)
else:
print('..Invalid entry. Choose from the following list:')
print(' ', self.all_col_headers)
continue
i += 1
print('\nTimestamp column list:', self.timestamp_col_headers)
enter_continue()
def setParamHeaders(self, print_banner=True, col_list=None):
"""Select the SDFS parameters corresponding to column names discovered
by ``parseDataSets()``.
A parameter renaming dictionary is created for reassigning the names
of header labels.
Args:
print_banner (bool, optional):
If ``'True'``, a banner indicating the title of the section,
user input options, and notes will be printed to the console.
Defaults to True.
Returns:
None.
"""
param_types = {'S': 'The header corresponds to an SDFS Parameter',
'C': 'The header corresponds to an existing custom Parameter',
'N': 'Create a new custom Parameter for the header',
'': '(enter key) Skip the current header and drop from SDFS datasets'}
pretty_params = pprint.pformat(param_types)
if print_banner:
#txt = 'Choose from the following list of SDFS parameter names'
self.printSelectionBanner('Specify Parameter columns',
options=[self.skip_str],
#notes=[txt, self.params]
)
# drop time-like columns and ask user for SDFS parameter associated
# with remaining cols
# param_col_list = [param for param in self.all_col_headers
# if param not in self.timestamp_col_headers)]
if col_list is None:
param_col_list = list(set(param for param in self.all_col_headers
if param not in self.timestamp_col_headers))
else:
param_col_list = list(set(param for param in col_list if param
not in self.timestamp_col_headers))
n_params = len(param_col_list)
renaming_dict = {}
for i, rec_header in enumerate(param_col_list, 1):
valid = False
while valid is False:
print(f'\n[{i}/{n_params}]')
print('-----')
header_type = input('Enter the character indicating the type of'
f' parameter \n{pretty_params}\n\nParameter type for header'
f' name "{rec_header}": ')
if header_type == 'S':
print('SDFS Parameters:')
print(self.sdfs_params)
set_header = input(f'From the list above, select the SDFS '
f'parameter associated with {rec_header}: ')
if set_header in self.sdfs_params:
valid = True
self.sdfs_header_names.append(set_header)
self.sdfs_header_names = list(set(self.sdfs_header_names))
if self.data_type == 'reference':
self.setParamMetaCols(rec_header, set_header)
unit_transform = self.checkParamUnits(rec_header, set_header)
self.add_param_attrib(rec_header,
attrib_key='unit_transform',
attrib_val=unit_transform)
drop = False
else:
print('..Invalid entry')
if header_type == 'C':
if self.custom_params != []:
set_header = input('Enter custom parameter associated with '
f'{rec_header}: ')
print(self.custom_params)
if set_header in self.custom_params:
valid = True
drop = False
else:
print('..Invalid entry')
else:
print('No custom Parameters previously configured')
if header_type == 'N':
set_header = input('Enter new custom parameter associated with '
f'{rec_header}: ')
response = validate_entry(statement=f'Do you wish to save {set_header} to the catalog of sensortoolkit.Parameter attributes?')
if response == 'y':
save_param = True
else:
save_param = False
print('')
Parameter(set_header, save_custom_param=save_param)
valid = True
drop = False
if header_type == '':
valid = True
print('..{0} will be dropped'.format(rec_header))
drop = True
set_header = ''
if header_type not in param_types.keys():
print('..Invalid parameter header type')
renaming_dict[rec_header] = set_header
self.add_param_attrib(rec_header,
attrib_key='header_class',
attrib_val='parameter')
self.add_param_attrib(rec_header,
attrib_key='sdfs_param',
attrib_val=set_header)
self.add_param_attrib(rec_header,
attrib_key='drop',
attrib_val=drop)
#TODO: Print dictionary with renaming scheme, ask to confirm
# add something like following code block,
print('')
print('Configured renaming scheme:')
self.pp.pprint(renaming_dict)
enter_continue()
def checkParamUnits(self, param, sdfs_param):
"""Prompt user to indicate whether units for passed parameter are the
same as the preset units specified for the corresponding SDFS parameter.
Args:
param (str):
The name of the parameter as logged in recorded datasets.
sdfs_param (str):
The name of the SDFS parameter corresponding to the recorded
parameter.
Returns:
val (int, float, or Nonetype): A scalar quantity for converting the
concentrations from the unit basis in which data were recorded to
the unit basis for the SDFS parameter.
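Example:
A sketch of a typical call (hypothetical recorded header
``'Temperature'`` mapped to the SDFS parameter ``'Temp'``; the user is
prompted at the console to confirm the units):
>>> unit_transform = self.checkParamUnits('Temperature', 'Temp')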
"""
val = None
sdfs_param_units = Parameter(sdfs_param).units
print('')
print(f' Are the units of measure [{sdfs_param_units}] for column header "{param}"?')
confirm = validate_entry(indent_statement=2)
print('')
if confirm == 'n':
if param in ('Temp', 'DP'):
print(f' Are the units of measure for {param} Fahrenheit?')
temp_confirm = validate_entry(indent_statement=2)
if temp_confirm == 'y':
print('')
print(f' {param} will be converted from Fahrenheit to Celsius')
val = 'f_c'
else:
print(' Temperature must be in either degrees Fahrenheit or Celsius')
else:
val = input(' Enter the scalar quantity for converting the '
'recorded measurements to the following unit basis: '
f'{sdfs_param_units}')
return val
def setDateTimeFormat(self):
"""Configure the date/time formatting for date/time column(s) specified
in ``setTimeHeaders()``.
Returns:
None.
"""
cite = ('..format code list: https://docs.python.org/3/library/'
'datetime.html#strftime-and-strptime-format-codes')
epoch = ('..If a timestamp column is formatted as the number of '
'seconds since the Unix epoch (1 Jan. 1970), enter "epoch"')
self.printSelectionBanner('Configure Timestamp Column Formatting',
options=[epoch, self.skip_str],
notes=[cite])
self.time_format_dict = {}
for col in self.timestamp_col_headers:
# Pass over previously configured timestamp columns (when using
# loadPreviousSetup())
for col_idx in self.col_headers.keys():
if col in self.col_headers[col_idx]:
col_attribs = self.col_headers[col_idx][col]
if 'dt_format' in col_attribs:
continue
invalid = True
while invalid is True:
val = input('Enter date/time formatting for "' + col + '": ')
if val == '':
self.add_param_attrib(col,
attrib_key='drop',
attrib_val=True)
invalid = False
continue
else:
confirm = validate_entry()
if confirm == 'y':
invalid = False
self.time_format_dict[col] = val
self.add_param_attrib(col,
attrib_key='dt_format',
attrib_val=val)
print('')
print('Configured formatting scheme:')
self.pp.pprint(self.time_format_dict)
enter_continue()
def setTimeZone(self):
"""Select the time zone associated with the date/time column(s).
Timezones should be valid timezone names recognized by the ``pytz``
library.
Returns:
None.
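Example:
Valid console entries are IANA names recognized by ``pytz`` (e.g.,
'US/Eastern', 'US/Mountain', 'UTC'); a sketch of validating one:
>>> import pytz
>>> pytz.timezone('US/Eastern')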
"""
self.printSelectionBanner('Specify DateTime Index Time Zone',
options=[self.skip_str],
notes=['For a list of all time zones, type'
' "pytz.all_timezones"'])
for col in self.timestamp_col_headers:
# Pass over previously configured timestamp columns (when using
# loadPreviousSetup())
for col_idx in self.col_headers.keys():
if col in self.col_headers[col_idx]:
col_attribs = self.col_headers[col_idx][col]
if ('dt_timezone' in col_attribs):
continue
invalid = True
while invalid is True:
val = input('Enter time zone for "' + col + '": ')
if val == '':
# timezone is unspecified
print('..time zone not specified, continuing with tz-naive'
' DateTime index')
tzone = None
self.time_format_dict[col + '_tz'] = tzone
invalid = False
continue
else:
try:
tzone = pytz.timezone(val)
except UnknownTimeZoneError:
print('..invalid time zone')
continue
confirm = validate_entry()
if confirm == 'y':
invalid = False
self.time_format_dict[col + '_tz'] = tzone.zone
self.add_param_attrib(col,
attrib_key='dt_timezone',
attrib_val=tzone.zone)
print('')
print('Configured time zone formatting:')
self.pp.pprint(self.time_format_dict)
enter_continue()
def exportSetup(self):
"""Save the setup configuration to a ``setup.json`` file.
Returns:
None.
"""
self.printSelectionBanner('Setup Configuration')
self.config_dict = self.__dict__.copy()
drop_attribs = ['end_str', 'del_str', 'skip_str', 'header_names',
'timestamp_col_headers', 'time_format_dict',
'all_col_headers']
for attrib in drop_attribs:
try:
del self.config_dict[attrib]
except KeyError:
pass
if self.data_type == 'sensor':
filename = self.name + '_setup.json'
sensor_path = os.path.normpath(
os.path.join(self.data_rel_path, '..'))
outpath = os.path.normpath(self.path + sensor_path)
if self.data_type == 'reference':
filename = 'reference_setup.json'
outpath = os.path.normpath(self.path + self.data_rel_path)
if not os.path.isdir(outpath):
os.makedirs(outpath)
self.outpath = os.path.join(outpath, filename)
print('')
print('..writing setup configuration to the following path:')
print(self.outpath)
print('')
with open(self.outpath, 'w') as outfile:
self.config_dict = json.dumps(self.config_dict, indent=4)
outfile.write(self.config_dict)
class SensorSetup(_Setup):
"""Interactive class for handling the sensor data ingestion process.
Users specify various attributes about sensor datasets, including column
names for parameter data and timestamp entries. A renaming scheme is then
constructed for converting the original naming scheme for columns into
a standardized format for parameter names. The formatting for columns with
date or time-like entries is then specified. The file type for sensor
data is selected from a dictionary of valid data types that can be
ingested.
Args:
name (str):
The name assigned to the air sensor. Typically includes the sensor
make (manufacturer) and model.
path (str, optional):
The path to the directory where the user intends to store data,
figures, and reports relating to the sensor being tested.
Defaults to None.
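Example:
A minimal sketch of direct instantiation (hypothetical sensor name and
project path; instantiation launches the interactive console setup):
>>> setup = SensorSetup(name='Example_Make_Model',
...                     path='/path/to/project/directory')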
"""
def __init__(self, name, path=None):
super().__init__(path)
self.name = name
self.data_type = 'sensor'
self.dataset_kwargs = {'name':self.name}
self.config()
self.setSerials()
self.exportSetup()
def setSerials(self):
"""Indicate unique serial identifiers for each sensor unit tested.
The identifying keyword for each sensor unit should be indicated within
the recorded sensor dataset file names.
Returns:
None.
"""
self.printSelectionBanner('Configure Sensor Serial Identifiers',
options=[self.end_str])
print('')
self.serials = {}
edit = True
col_n = 1
abbrev_files = []
for file in self.file_list:
file = file.replace(self.path + '/data/sensor_data/' +
self.name + '/raw_data/', '')
abbrev_files.append(file)
print('..{0}'.format(file))
confirm = 'n'
while confirm == 'n':
val = input("Enter the number of unique sensors corresponding "
"to the datasets above: ")
try:
val = int(val)
except ValueError:
print('..Invalid entry, enter an integer value')
continue
confirm = validate_entry()
self.number_of_sensors = val
print('Enter unique serial identifiers for each sensor associated '
'with the datasets listed above:')
while col_n <= self.number_of_sensors:
confirm = 'n'
while confirm == 'n':
serial = input("Enter serial identifier #{0}: ".format(str(col_n)))
if serial == 'X':
edit = False
break
elif not any(serial in file for file in abbrev_files):
print('..invalid entry, identifier must be contained in '
'the filenames listed above')
else:
confirm = validate_entry()
if edit is False:
break
else:
self.serials[str(col_n)] = serial
col_n += 1
print('')
print('Configured serial identifiers:')
self.pp.pprint(self.serials)
print('')
enter_continue()
class ReferenceSetup(_Setup):
"""Interactive class for handling the reference data ingestion process.
Args:
path (str, optional):
The path to the directory where the user intends to store data,
figures, and reports relating to the sensor being tested.
Defaults to None.
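Example:
A minimal sketch of direct instantiation (hypothetical project path;
instantiation launches the interactive console setup):
>>> setup = ReferenceSetup(path='/path/to/project/directory')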
"""
# Method code lookup tables
criteria_methods_path = os.path.abspath(os.path.join(__file__,
'../../reference/method_codes/methods_criteria.csv'))
criteria_lookup = pd.read_csv(criteria_methods_path)
critera_params = {'CO': 'Carbon monoxide',
'Pb_TSP': 'Lead (TSP) LC',
'Pb_PM10': 'Lead PM10 LC FRM/FEM',
'NO2': 'Nitrogen dioxide (NO2)',
'O3': 'Ozone',
'PM10': 'PM10 Total 0-10um STP',
'PM25': 'PM2.5 - Local Conditions',
'SO2': 'Sulfur dioxide'}
api_services = ['aqs', 'airnow']
met_methods_path = os.path.abspath(os.path.join(__file__,
'../../reference/method_codes/methods_met.csv'))
met_lookup = pd.read_csv(met_methods_path)
def __init__(self, path):
super().__init__(path)
self.data_type = 'reference'
self.dataset_kwargs = {'ref_data_source': None}
self.agency = None
self.site_name = None
self.site_aqs = None
self.site_lat = None
self.site_lon = None
self.selectDataSource()
self.setSiteInfo()
if self.dataset_kwargs['ref_data_source'] in self.api_services:
self.setDataRelPath()
elif self.dataset_kwargs['ref_data_source'] == 'airnowtech':
self.setDataRelPath()
self.processAirNowTech()
else:
self.config()
self.exportSetup()
if self.dataset_kwargs['ref_data_source'] == 'local':
self.localRefDataIngest()
def selectDataSource(self):
"""Select the service/source from which reference data were acquired.
Choose from the following options:
- ``'local'``: Data files acquired locally (e.g., local transfer
from agency overseeing reference instrumentation at air monitoring
site).
- ``'airnowtech'``: User has downloaded files from the AirNowTech
system and has saved files locally to the user’s system.
- ``'aqs'``: User will query EPA's Air Quality System (AQS) API for
reference data.
- ``'airnow'``: User will query the AirNow API for reference data.
Returns:
None.
"""
# Indicate the service used to acquire the dataset.
select_options = ['airnow', 'aqs', 'airnowtech', 'local']
self.printSelectionBanner('Select Name of Service for Data Source',
options=[select_options])
valid = False
while valid is False:
console_text = ('Enter the name of the service from the list of'
' options above: ')
console_text = '\n'.join(wrap(console_text,
width=self.__banner_w__))
val = input(console_text)
if str(val) not in select_options:
print('..invalid entry, please enter one of the options '
'listed above')
else:
selection = val
print('')
print('Data acquired from:', selection)
confirm = validate_entry()
if confirm == 'y':
valid = True
self.dataset_kwargs['ref_data_source'] = selection
print('')
def setSiteInfo(self):
"""Prompt user to enter various site attributes.
The user is prompted to provide the following site attributes:
- Site name
- Agency overseeing site
- Site AQS ID
- Site latitude
- Site longitude
.. important::
**The following attributes are required for querying API services:**
- If the reference data source is ``'aqs'``, an AQS ID must be
specified.
- If the reference data source is ``'airnow'``, the site latitude
and longitude must be specified.
Returns:
None.
"""
airdata_link = 'https://epa.maps.arcgis.com/apps/webappviewer/index.html?id=5f239fd3e72f424f98ef3d5def547eb5'
self.printSelectionBanner('Enter Ambient Air Monitoring Site Information',
options=['..press enter to skip entries'],
notes=['Site AQS ID required for AQS queries',
'Use the EPA AirData Air Quality Monitors Map to locate AQS Sites'
f' {airdata_link}',
'Site Latitude and Longitude required for AirNow queries',
' Latitude must be between -90 and +90 degrees ',
' Longitude must be between -180 and +180 degrees'])
self.agency = None
self.site_name = None
self.site_aqs = None
self.site_lat = None
self.site_lon = None
site_dict = {
'Enter the name of the monitoring site: ': 'site_name',
'Enter the name of the Agency overseeing the monitoring site: ': 'agency',
'Enter the AQS site ID (if applicable, format XX-XXX-XXXX): ': 'site_aqs',
'Enter the site latitude (in decimal coordinates): ': 'site_lat',
'Enter the site longitude (in decimal coordinates): ': 'site_lon'
}
for console_statement, attrib in zip(site_dict.keys(), site_dict.values()):
valid = False
while valid is False:
console_statement = '\n'.join(wrap(console_statement,
width=self.__banner_w__))
val = input(console_statement)
if attrib == 'site_aqs':
if val == '' and self.dataset_kwargs['ref_data_source'] == 'aqs':
print('..Invalid entry, AQS Site ID must be specified for AQS queries')
continue
elif val != '':
list_val = val.split('-')
if len(list_val) != 3:
print('..Invalid format, enter site ID in the format XX-XXX-XXXX')
continue
# length of components in aqs site ID
aqs_fmt = {'State Code': 2,
'County Code':3,
'Site Code':4}
invalid_fmt = False
for entry, expect_key, expect_len in zip(list_val, aqs_fmt.keys(), aqs_fmt.values()):
if len(entry) != expect_len:
print(f'..Invalid format for AQS Site ID {expect_key}: {entry}')
print(f'....expected code length {expect_len}')
invalid_fmt = True
if invalid_fmt:
continue
if attrib == 'site_lat' or attrib == 'site_lon':
if val == '' and self.dataset_kwargs['ref_data_source'] == 'airnow':
print('..Invalid entry, Latitude and Longitude must be specified for AirNow queries')
continue
elif val != '':
try:
cast_val = float(val)
except ValueError:
print('..Invalid entry, value must be numeric')
continue
if attrib == 'site_lat' and (cast_val < -90 or cast_val > 90):
print('..Invalid entry, Latitude must be between -90 and +90 degrees')
continue
if attrib == 'site_lon' and (cast_val < -180 or cast_val > 180):
print('..Invalid entry, Longitude must be between -180 and +180 degrees')
continue
if val == '':
print('..skipping')
valid = True
continue
confirm = validate_entry()
if confirm == 'y':
valid = True
self.__dict__.update({attrib: val})
print('')
if self.site_name is None:
self.site_name = 'Unspecified Site Name'
self.site_name = self.site_name.title()
self.fmt_site_name = self.site_name.replace(' ', '_')
if self.site_aqs is None:
self.site_aqs = 'Unspecified Site ID'
self.fmt_site_aqs = self.site_aqs.replace('-', '').replace(' ', '_')
self.dataset_kwargs['site_name'] = self.fmt_site_name
self.dataset_kwargs['site_aqs'] = self.fmt_site_aqs
self.ref_data_subfolder = '_'.join([self.fmt_site_name,
self.fmt_site_aqs])
def displayMethods(self, param_code, lookup_data):
"""Helper function for printing an abbreviated dataset of reference
methods corresponding to the indicated parameter.
Args:
param_code (int):
AQS parameter code.
lookup_data (pandas DataFrame):
AQS method code lookup table containing a list of FRM/FEM
reference methods.
Returns:
table (pandas DataFrame):
A table containing a listing of reference methods designated
FRM/FEMs for the indicated parameter.
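Example:
A sketch using the criteria pollutant lookup table (88101, the AQS
parameter code for PM2.5 - Local Conditions, assumed for illustration):
>>> table = self.displayMethods(param_code=88101,
...                             lookup_data=self.criteria_lookup)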
"""
with pd.option_context('display.expand_frame_repr', False,
'display.max_rows', None):
table = lookup_data[lookup_data['Parameter Code']==param_code]
print('')
print(table[['Method Code',
'Collection Description',
'Method Type']].to_markdown(index=False))
return table
def processAirNowTech(self):
"""Wrapper method for calling the ``sensortoolkit.reference.preprocess_airnowtech()``
method for converting downloaded AirNowTech datasets to SDFS format.
Returns:
None.
"""
self._dataset_selection = 'files'
self.setDataExtension()
self.copyDataSets()
self.printSelectionBanner('Pre-process AirNow-Tech Datasets',
options=[])
print('')
for file in self.file_list:
preprocess_airnowtech(file, self.path)
print('')
def localRefDataIngest(self):
"""Wrapper method for ingesting reference datasets acquired locally.
Datasets are ingested into SDFS format via the
``sensortoolkit.ingest.standard_ingest()`` method and processed datasets
are grouped into one of three parameter classifications (``'PM'``,
``'Gases'``, or ``'Met'``). These datasets are then saved in monthly
intervals to the ``../data/reference_data/local/processed/[sitename_siteid]``
directory path.
Returns:
None.
"""
self.printSelectionBanner('Ingest Local Datasets',
options=[])
process_path = os.path.normpath(os.path.join(self.outpath,
f'../../../processed/{self.ref_data_subfolder}'))
if not os.path.isdir(process_path):
os.makedirs(process_path)
parameter_classes = {}
for param in self.sdfs_header_names:
parameter_classes[param] = Parameter(param).classifier
for file in self.file_list:
df = standard_ingest(file, name=None,
setup_file_path=self.outpath)
# Separate dataframe by parameter classifier
for classifier in ['PM', 'Gases', 'Met']:
class_params = [param for param in parameter_classes
if parameter_classes[param] == classifier]
class_param_cols = []
site_cols = ['Site_Name', 'Agency', 'Site_AQS',
'Site_Lat', 'Site_Lon', 'Data_Source']
for param in class_params:
class_param_cols.extend([col for col in df.columns
if col.startswith(param)])
if class_param_cols != []:
class_param_cols.extend(site_cols)
else:
continue
class_df = df[class_param_cols]
# Save class dataframe in monthly segments
for date in pd.date_range(start=class_df.index.min(),
end=class_df.index.max()).to_period('M').unique():
month = str(date.month).zfill(2)
year = str(date.year)
month_df = class_df.loc[year + '-' + month, :]
samp_freq = get_timestamp_interval(month_df,
as_timedelta=True)
ONE_HOUR = pd.to_timedelta('60 m')
if samp_freq < ONE_HOUR:
N = ONE_HOUR / samp_freq
month_df = interval_averaging(month_df, freq='H',
interval_count=N,
thres=0.75)
# Write to processed folder as csv
filename = 'H_' + year + month + '_' + classifier + '.csv'
print(f'..{filename}')
month_df.to_csv(os.path.join(process_path, filename))