🎓 This notebook loads air quality and meteorological data from the [Auckland Council Environmental Data Portal](https://environmentauckland.org.nz/Data/Dashboard/183), then uses Descriptive Statistics to characterise the data set, so that it is ready for reuse in the next steps of CRISP-DM for Data Science.
Data:
Workflow Steps:
# %%capture
# !pip install -r requirements.txt
## Data Manipulation and Computation
import pandas as pd ## Data processing, file I/O
import numpy as np ## Linear algebra
from scipy import stats
from scipy.stats import skew, kurtosis, boxcox
from scipy.stats.mstats import winsorize, pearsonr
## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
## Web Application Framework (for Dash)
import dash
# import dash_core_components as dcc
# import dash_html_components as html
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import os, time, math, logging
# import textwrap
from dotenv import load_dotenv
## Time Series Analysis
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm
from pmdarima import preprocessing
# from pmdarima.arima import auto_arima
## Machine Learning - Model Building and Evaluation
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
# from xgboost import XGBRegressor
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from keras.layers import LSTM, Dense
# from keras.models import Sequential
# from keras.optimizers import Adam
# # from tensorflow.keras.layers import Dense, LSTM
# # from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
# # from tensorflow.keras.callbacks import EarlyStopping
import warnings ## Suppress warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")
# warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', 5)
import getpass
from collections import OrderedDict
from teradataml import db_list_tables
import teradataml as tdml
from teradataml import read_csv, FillNa, valib
from teradataml import db_drop_table, fastload, OneHotEncodingFit, OneHotEncodingTransform, ScaleFit, ScaleTransform
from teradataml.context.context import *
from teradataml.dataframe.copy_to import copy_to_sql
from teradataml.dataframe.dataframe import DataFrame as TeradataDataFrame
from teradataml.options.configure import configure
from teradatasqlalchemy import (INTEGER, FLOAT, VARCHAR)
import json
from sqlalchemy import func
from prettytable import PrettyTable
def print_env_variables_as_table(env_vars):
"""
Prints the values of the specified environment variables in a table format.
:param env_vars: A dictionary of environment variable names and their descriptions.
"""
table = PrettyTable()
table.field_names = ["Variable", "Description", "Value"]
table.align = "l"
for var, description in env_vars.items():
value = os.getenv(var, 'Not set or not found')
table.add_row([var, description, value])
print(table)
## Dictionary mapping environment variable names to their descriptions
environment_variables = {
'IS_TERADATA_VANTAGE': 'Using scalable Teradata Vantage vs. local machine (Laptop)',
'IS_LOADING_FROM_FILES': 'True if loading data from *.csv/xls files; False if using imported data in Teradata Vantage',
'IS_DATA_IN_TERADATA_VANTAGE': 'Using TeradataDataFrame in scalable Vantage vs. PandasDataFrame with local *.csv/xls files',
'SCHEMA_NAME': '[Teradata Vantage] Schema Name',
'TABLE_NAME': '[Teradata Vantage] Table Name',
'IS_JUPYTERLAB': 'Running in JupyterLab vs Python Dash/Vizro Dashboard',
# 'IS_DASHBOARD': 'Using a stand-alone Dashboard UI/UX (not JupyterLab)',
'IS_DEBUG': 'Debug mode is active or not',
'USE_DATA_PREFIX': "Prefix to use for data files: 'raw' | 'cleaned'",
}
## Specify the correct path to your .env file: export ENV_PATH=".env_cleaned"
# dotenv_path = os.getenv('ENV_PATH', 'raw_pandas.env')
dotenv_path = os.getenv('ENV_PATH', 'cleaned_vantage.env')
## Load environment variables from the .env file (if present)
load_dotenv(dotenv_path=dotenv_path)
print(f'The specified .env file: {dotenv_path}')
## Access environment variables as if they came from the actual environment
IS_JUPYTERLAB = os.getenv('IS_JUPYTERLAB') == 'True'
IS_LOADING_FROM_FILES = os.getenv('IS_LOADING_FROM_FILES') == 'True'
IS_TERADATA_VANTAGE = os.getenv('IS_TERADATA_VANTAGE') == 'True'
IS_DATA_IN_TERADATA_VANTAGE = os.getenv('IS_DATA_IN_TERADATA_VANTAGE') == 'True'
IS_DEBUG = os.getenv('IS_DEBUG') == 'True'
# IS_DASHBOARD = os.getenv('IS_DASHBOARD') == 'True'
USE_DATA_PREFIX = os.getenv('USE_DATA_PREFIX')
SCHEMA_NAME = os.getenv('SCHEMA_NAME')
TABLE_NAME = os.getenv('TABLE_NAME')
## Define the path to the CSV file
PATH = f'data/{USE_DATA_PREFIX}' ## 'raw' | 'cleaned'
# if USE_DATA_PREFIX.lower() == 'raw':
# PATH = 'data/raw' ## For Data_Loading_and_Descriptive_Statistics.ipynb
# elif IS_DASHBOARD:
# PATH = '../data' ## For app.ipynb
# else:
# PATH = f'data/{USE_DATA_PREFIX}' ## For Time Series & ML/DL: 'cleaned_' | 'final_'
## [DEBUG] show the relevant info from .env
if IS_DEBUG:
print(f'PATH: {PATH}')
## Call the function to print the environment variables
print_env_variables_as_table(environment_variables)
The specified .env file: cleaned_vantage.env
PATH: data/cleaned
+-----------------------------+----------------------------------------------------------------------------------------------+-----------------------+
| Variable                    | Description                                                                                  | Value                 |
+-----------------------------+----------------------------------------------------------------------------------------------+-----------------------+
| IS_TERADATA_VANTAGE         | Using scalable Teradata Vantage vs. local machine (Laptop)                                   | True                  |
| IS_LOADING_FROM_FILES       | True if loading data from *.csv/xls files; False if using imported data in Teradata Vantage  | True                  |
| IS_DATA_IN_TERADATA_VANTAGE | Using TeradataDataFrame in scalable Vantage vs. PandasDataFrame with local *.csv/xls files   | True                  |
| SCHEMA_NAME                 | [Teradata Vantage] Schema Name                                                               | Air_Pollution         |
| TABLE_NAME                  | [Teradata Vantage] Table Name                                                                | Air_Pollution_cleaned |
| IS_JUPYTERLAB               | Running in JupyterLab vs Python Dash/Vizro Dashboard                                         | True                  |
| IS_DEBUG                    | Debug mode is active or not                                                                  | True                  |
| USE_DATA_PREFIX             | Prefix to use for data files: 'raw' | 'cleaned'                                              | cleaned               |
+-----------------------------+----------------------------------------------------------------------------------------------+-----------------------+
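Side note: a minimal sketch (not run here) of switching to the other configuration in the same session, assuming an alternate file such as raw_pandas.env exists next to the notebook, as hinted at in the commented-out default above:

import os
from dotenv import load_dotenv

## Illustrative only: point ENV_PATH at the alternate .env file and reload it, overriding values loaded earlier
os.environ['ENV_PATH'] = 'raw_pandas.env'
load_dotenv(dotenv_path=os.environ['ENV_PATH'], override=True)
print(os.getenv('USE_DATA_PREFIX'))  ## e.g. 'raw' instead of 'cleaned'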
You will be prompted to provide the password. Enter your password, press the Enter key, then use the down arrow to move to the next cell.
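As a minimal sketch of what that prompt amounts to (the bundled ../startup.ipynb may implement it differently), the password can be captured without echoing via getpass:

import getpass

## Illustrative only: prompt for the Vantage password without echoing it;
## ../startup.ipynb is assumed to do something similar and leave the result in `password`
password = getpass.getpass(prompt='Teradata Vantage password: ')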
if IS_TERADATA_VANTAGE:
%run -i ../startup.ipynb
eng = create_context(host = 'host.docker.internal', username='demo_user', password = password)
print(eng)
...
Logon successful
Connected as: xxxxxsql://demo_user:xxxxx@host.docker.internal/dbc
Engine(teradatasql://demo_user:***@host.docker.internal)
%%capture
if IS_TERADATA_VANTAGE:
execute_sql('''SET query_band='DEMO=Data_Loading_and_Descriptive_Statistics.ipynb;' UPDATE FOR SESSION; ''')
Begin running the steps with the Shift + Enter keys.
## Define Teradata Vantage: schema & table name
SCHEMA_NAME= 'Air_Pollution'
TABLE_NAME = f'Air_Pollution_{USE_DATA_PREFIX}'
## Define the mapping of original column names to new column names
columns_name_mapping = {
'Timestamp (UTC+12:00)':'Timestamp',
'AQI.Air Quality Index (AQI)':'AQI',
'PM10.Hourly Aggregate (µg/m³)':'PM10',
'PM2,5.Hourly Aggregate (µg/m³)':'PM2.5',
'NO.Hourly Aggregate (µg/m³)':'NO',
'NO2.Hourly Aggregate (µg/m³)':'NO2',
'NOx.Hourly Aggregate (µg/m³)':'NOx',
'CO.Hourly Aggregate (mg/m³)':'CO', ## No Data in both Site1 (Penrose) and Site2 (Takapuna)
'O3.Hourly Aggregate (µg/m³)':'O3', ## No Data in both Site1 (Penrose) and Site2 (Takapuna)
'SO2.Hourly Aggregate (µg/m³)':'SO2', ## No Data in Site2 (Takapuna)
'Wind Speed.Hourly Aggregate (m/s)':'Wind_Speed',
'Wind Dir.Hourly Aggregate (°)':'Wind_Dir',
'Air Temp.Hourly Aggregate (°C)':'Air_Temp',
'Rel Humidity.Hourly Aggregate (%)':'Rel_Humidity',
# 'Solar Rad.Hourly Aggregate (W/m²)':'Solar_Rad', ## No Data in both Site1 (Penrose) and Site2 (Takapuna)
}
# columns_name_mapping_cleaned = {
# 'Timestamp':'Timestamp',
# 'AQI':'AQI',
# 'PM10':'PM10',
# 'PM2.5':'PM2.5',
# 'SO2':'SO2', ## No Data in Site2
# 'NO':'NO',
# 'NO2':'NO2',
# 'NOx':'NOx',
# 'Wind_Speed':'Wind_Speed',
# 'Wind_Dir':'Wind_Dir',
# 'Air_Temp':'Air_Temp',
# 'Rel_Humidity':'Rel_Humidity',
# }
### [DEBUG] Listing of all input data files under "input/" directory: *.csv, *.xlsx
import os
print(f'\nℹ️ Load Data from {PATH}\n')
for dirname, _, filenames in os.walk(PATH):
for filename in filenames:
## Check if the filename ends with .csv or .xlsx
if filename.endswith('.csv') or filename.endswith('.xlsx'):
print(os.path.join(dirname, filename))
# pass
ℹ️ Load Data from data/cleaned

data/cleaned/cleaned_Takapuna23-07May2020-to-30Apr2022.csv
data/cleaned/cleaned_Penrose7-07May2020-to-30Apr2022.csv
def load_and_prepare_data(file_path, site_name, site_class, include_columns=[]):
## Read the dataset from the CSV file
print(f"\nℹ️ Load Data from {file_path} file --> rawdata DataFrame 📂")
raw_data = pd.read_csv(file_path)
# raw_data = pd.read_excel(file_path)
if USE_DATA_PREFIX.lower() == 'raw':
## Rename the columns in the DataFrame
raw_data = raw_data.rename(columns=columns_name_mapping)
# else:
# raw_data = raw_data.rename(columns=columns_name_mapping_cleaned)
## Keep only the columns listed in include_columns (all are expected to be present in the DataFrame)
# selected_columns = [col for col in columns_name_mapping.values() if col not in exclude_columns and col in raw_data.columns]
raw_data = raw_data[include_columns]
## Add site name and site class columns
raw_data['Site'] = site_name
# raw_data['Site_Class'] = site_class
# raw_data['Country'] = 'New Zealand' ## TODO: Hard-code
## Convert 'Timestamp' column to datetime ("29/12/2020 00:00:00" as December 29, 2020) & addressing the date format warning.
# raw_data['Timestamp'] = pd.to_datetime(raw_data['Timestamp'], dayfirst=True, errors='coerce')
raw_data['Timestamp'] = pd.to_datetime(raw_data['Timestamp'], dayfirst=True)
# raw_data['Timestamp'] = raw_data['Timestamp'].tz_localize("NZ")
# raw_data['Timestamp'] = pd.to_datetime(raw_data['Timestamp'], format='%d/%m/%Y %H:%M')
return raw_data
The `fastload()` API writes records from a Pandas DataFrame to Teradata Vantage using the FastLoad protocol. It can be used to quickly load large amounts of data into an empty table on Vantage, and it excels at row counts above 100K - shown here as an illustration. These Teradata functions expose many parameters to control their behaviour; the `if_exists` parameter is particularly useful, so we do not have to drop the table explicitly before loading it, or we can append to it instead. For smaller row counts and more flexibility, `copy_to_sql` can be used.
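As an illustrative sketch only, a smaller table could be written with copy_to_sql; the function below is defined but never called in this notebook, and the composite primary index simply mirrors the fastload() call that follows:

## Illustrative sketch (not called): copy_to_sql alternative for smaller row counts
from teradataml.dataframe.copy_to import copy_to_sql

def copy_small_table_to_vantage(df, table_name):
    """Write a small Pandas DataFrame to a Vantage table using copy_to_sql."""
    copy_to_sql(df=df,
                table_name=table_name,
                if_exists='replace',                  ## avoids an explicit DROP TABLE before loading
                primary_index=['Site', 'Timestamp'],  ## assumed composite primary index
                index=False)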
# def load_and_prepare_data_in_database(raw_data, schema_name, table_name, if_exists='replace'):
def load_and_prepare_data_in_database(raw_data, table_name, if_exists='replace'):
"""
Loads data from a Pandas DataFrame into a Teradata table using teradataml,
and prepares the data by performing specified SQL transformations.
Parameters:
- df: Pandas DataFrame containing the data to load.
- table_name: The name of the target table in Teradata.
- if_exists: Action to take if the target table already exists. Options include 'replace'.
"""
## [DEBUG] Little bit of code that creates an index
# raw_data['txn_id'] = range(1, len(raw_data) + 1)
## FIXME: 'Site' & 'Timestamp'
# copy_to_sql(raw_data, table_name=TABLE_NAME, if_exists="replace")
fastload(raw_data,
# schema_name=schema_name,
table_name=table_name,
index=False,
# primary_index='txn_id',
primary_index=['Site', 'Timestamp'],
if_exists = if_exists,
open_sessions=2)
# return DataFrame.from_table(table_name)
return TeradataDataFrame(table_name)
class DescriptiveStatistics:
@staticmethod
def describe_data(_data, ordinal_columns, numerical_columns=None):
## The Shape | Head (First 5 rows) | Tail (Last 5 rows) of the Dataframe
# nrow, ncol = _data.shape
# print("\nℹ️ The Shape of the Dataframe: \n", _data.shape)
# print(nrow, ncol)
## Sum up the number of missing values contained within the dataframe.
# missing_data = _data.isnull().sum()
# print("\nℹ️ Sums up missing values: \n", missing_data)
## Summary Statistics of the Dataframe such as the mean, maximum and minimum values
# print("\nℹ️ First 5 rows of the Dataframe: \n", _data.head())
# print("\nℹ️ Last 5 rows of the Dataframe: \n", _data.tail())
# print("\nℹ️ Information about the dataframe and count of the non-null values: \n", _data.info())
# print("\nℹ️ Summary Statistics (count | mean | std | min | max) of the dataframe: \n", _data.describe())
# print("\n🎓 Describing the types of each attribute as Nominal, Ordinal, or Continuous ...")
## Derive numerical columns directly from their data types, excluding those listed in ordinal_columns
# print("\nℹ️ Data-type of each column in the dataframe: \n", _data.dtypes)
# numerical_columns = _data.select_dtypes(include=[np.number]).columns
numerical_columns = [col for col in _data.select_dtypes(include=[np.number]).columns if col not in ordinal_columns]
## Identifying nominal (categorical) columns; these are typically of object or category dtype
## Note: This includes 'Site' and 'Site_Class', and any other categorical columns present
nominal_columns = _data.select_dtypes(include=[object, 'category']).columns
# nominal_columns = _data.select_dtypes(include=['object', 'category']).columns.tolist()
## Ensure ordinal columns such as 'Timestamp' are not also reported as numerical
numerical_columns = [col for col in numerical_columns if col not in ordinal_columns]
# Print the classification of columns
print("\nℹ️ Numerical Variables/Features: \n", numerical_columns)
print("\nℹ️ Ordinal Variables/Features: \n", ordinal_columns)
if nominal_columns is not None and len(nominal_columns) > 0:
print("\nℹ️ Nominal Variables/Features: \n", nominal_columns)
## Return the lists for further use
return numerical_columns, nominal_columns
@staticmethod
def check_duplicate_rows(_data):
"""
Verify_data_quality
"""
duplicates = _data.duplicated()
print("\nℹ️ Number of duplicate rows: ", duplicates.sum())
if duplicates.sum() > 0:
print("\nDuplicate rows: \n", _data[duplicates])
return duplicates
@staticmethod
def correlations_heatmap_with_regression(data, continuous_columns, hue_column='Site', corner=False):
"""
This function takes a DataFrame and a list of continuous column names and creates a seaborn PairGrid
with regression lines for continuous columns, colored by hue_column, in the lower triangle,
KDE plots in the upper, and histograms in the diagonal.
Parameters:
- data (DataFrame): The data to be plotted.
- continuous_columns (list): List of continuous column names for the pairplot.
- hue_column: Column name to use for hue differentiation. Default is 'Site'.
- corner: If True, only plots the lower triangle of the pair grid to avoid redundancy. Default is False.
"""
## Check if hue_column exists: 'Site'
if hue_column not in data.columns:
raise KeyError(f"'{hue_column}' column not found in the data.")
## [DEBUG] Resetting the index so 'Site' becomes a column again, suitable for use as a hue in pairplot
## Ensure 'Site' is a column in the DataFrame for hue mapping
# df = data.reset_index() if 'Site' not in data.columns else data
# df = data.reset_index()
# Replace inf/-inf with NaN to avoid plotting issues
df = data.replace([np.inf, -np.inf], np.nan)
# ## Determine the number of unique 'Site' values
# num_unique_sites = _df['Site'].nunique()
# ## Generate a list of markers, one for each unique 'Site' value: ['o', 's', 'D', '^', ...] up to the number of unique sites
# marker_list = ['o', 's', 'D', '^', '<', '>', 'v', '*', '+', 'x'][:num_unique_sites]
## Define PairGrid
# pair_grid_kwargs = {'hue': hue_column, 'palette': "Set2", 'diag_sharey': False}
if corner:
# pair_grid = sns.PairGrid(df[continuous_columns + [hue_column]], corner=True, **pair_grid_kwargs)
pair_grid = sns.PairGrid(df[continuous_columns + [hue_column]], hue=hue_column, palette="Set2", diag_sharey=False, hue_kws={"marker": ['o', 's']}, corner=True)
else:
# pair_grid = sns.PairGrid(df[continuous_columns + [hue_column]], **pair_grid_kwargs)
pair_grid = sns.PairGrid(df[continuous_columns + [hue_column]], hue=hue_column, palette="Set2", diag_sharey=False, hue_kws={"marker": ['o', 's']})
## KDE plot for the upper triangle
pair_grid.map_upper(sns.kdeplot, fill=True)
## Histogram for the diagonal
pair_grid.map_diag(sns.histplot, alpha=0.5) ## sns.kdeplot
## [Left] Regression plot for the lower triangle
pair_grid.map_lower(sns.regplot) ## sns.scatterplot
## Add legend and adjust title
pair_grid.add_legend(loc='upper left', ncol=3)
plt.suptitle("Pairplot with Regression Lines by " + hue_column, y=1.02, size='x-large')
## Set the title of the figure
# pair_grid.fig.suptitle("Pairplot with Regression Lines by Site", y=1.01, size='x-large') ## Adjust title and its position
## Display the plot
plt.show()
@staticmethod
def correlations_matrix(df, method='pearson', figsize=(20, 10), title=None, _rotate=False):
"""
Generates a heatmap for the correlation matrix of a DataFrame.
Parameters:
- df (DataFrame): Pandas DataFrame containing only numerical columns.
- method (str): Correlation computation method, 'pearson' (default), 'spearman', or 'kendall'.
- figsize (tuple): Figure size for the heatmap.
- title (str): Title for the heatmap.
Returns:
- _correlation_matrix: The function displays a heatmap.
"""
if method not in ['pearson', 'spearman', 'kendall']: ## Validate correlation method
raise ValueError("Method should be 'pearson', 'spearman', or 'kendall'.")
## Work on a copy of the df so the original is not modified
data = df.copy()
## Check feature correlations (numeric columns only)
_correlation_matrix = data.corr(method=method, numeric_only=True)
## Create a mask to hide the upper triangle
# mask = np.triu(np.ones_like(_correlation_matrix, dtype=bool))
mask = np.zeros_like(_correlation_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
## Set the diagonal elements of the mask to False to display self-correlation
np.fill_diagonal(mask, False)
fig, ax = plt.subplots(figsize=figsize)
plt.title(title, fontsize='x-large', y=1.01)
## Visualize the Correlation Matrix by plotting a Heatmap with the custom mask
plt.rcParams.update({'font.size': 12})
heatmap = sns.heatmap(_correlation_matrix,
annot=True, annot_kws={"fontsize": 10}, ## Adjust annotation font size
fmt=".2f", linewidths=0.5, cmap='RdBu', ## 'coolwarm' 'viridis' 'RdBu'
square=True,
# cbar_kws={"shrink": .5},
cbar_kws = {'shrink': .4,"ticks" : [-1, -.5, 0, 0.5, 1]},
vmin = -1, vmax = 1,
mask=mask, ## the mask has been included here
ax=ax)
## Display the plot
if _rotate:
## Invert the y-axis to "rotate" the heatmap
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
## Additionally, rotate the y-tick labels if it improves readability
plt.yticks(rotation=90)
# plt.tight_layout()
plt.show()
return _correlation_matrix
# print(plt.style.available)
# plt.style.use('tableau-colorblind10')
if IS_LOADING_FROM_FILES:
numerical_columns = ['Timestamp','AQI','PM10','PM2.5','SO2','NO','NO2','NOx','Wind_Speed','Wind_Dir','Air_Temp','Rel_Humidity']
include_columns_site1 = ['Timestamp','AQI','PM10','PM2.5','SO2','NO','NO2','NOx','Wind_Speed','Wind_Dir','Air_Temp','Rel_Humidity']
include_columns_site2 = ['Timestamp','AQI','PM10','PM2.5','NO','NO2','NOx','Wind_Speed','Wind_Dir','Air_Temp','Rel_Humidity']
# if USE_DATA_PREFIX.lower() == 'raw':
# ## Define the specifications for each monitoring site
# sites_info = [
# {"path": f"{PATH}/Penrose7-07May2020-to-30Apr2022.csv", "name": "Penrose", "class": "Industrial / Traffic", "include_columns": include_columns_site1},
# {"path": f"{PATH}/Takapuna23-07May2020-to-30Apr2022.csv", "name": "Takapuna", "class": "Urban Background", "include_columns": include_columns_site2},
# # {"path": f"{PATH}/air-pm-QueenStreet9-30Dec2019-to-29Feb2024.csv", "name": "QueenStreet", "class": "Traffic Peak", "exclude_columns": ['CO', 'O3', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity', 'Solar_Rad']}
# ]
# else:
# ## Define the specifications for each monitoring site
# sites_info = [
# {"path": f"{PATH}/{USE_DATA_PREFIX}_Penrose7-07May2020-to-30Apr2022.csv", "name": "Penrose", "class": "Industrial / Traffic", "include_columns": include_columns_site1},
# {"path": f"{PATH}/{USE_DATA_PREFIX}_Takapuna23-07May2020-to-30Apr2022.csv", "name": "Takapuna", "class": "Urban Background", "include_columns": include_columns_site2},
# # {"path": f"{PATH}/air-pm-QueenStreet9-30Dec2019-to-29Feb2024.csv", "name": "QueenStreet", "class": "Traffic Peak", "exclude_columns": ['CO', 'O3', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity', 'Solar_Rad']}
# ]
## Define the specifications for each monitoring site
sites_info = [
{"path": f"{PATH}/{USE_DATA_PREFIX}_Penrose7-07May2020-to-30Apr2022.csv", "name": "Penrose", "class": "Industrial / Traffic", "include_columns": include_columns_site1},
{"path": f"{PATH}/{USE_DATA_PREFIX}_Takapuna23-07May2020-to-30Apr2022.csv", "name": "Takapuna", "class": "Urban Background", "include_columns": include_columns_site2},
# {"path": f"{PATH}/{USE_DATA_PREFIX}_QueenStreet9-30Dec2019-to-29Feb2024.csv", "name": "QueenStreet", "class": "Traffic Peak", "exclude_columns": ['CO', 'O3', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity', 'Solar_Rad']}
]
## Load and prepare the data for each site
dfs = [load_and_prepare_data(site['path'], site['name'], site['class'], site['include_columns']) for site in sites_info]
## Concatenate all the dataframes into one
rawdata = pd.concat(dfs, ignore_index=True)
## Display the first few rows of the final dataframe
# rawdata.head().style.set_properties(subset=['Timestamp'], **{'background-color': 'dodgerblue'})
print("\nℹ️ The Shape of the rawdata Dataframe:", rawdata.shape)
ℹ️ Load Data from data/cleaned/cleaned_Penrose7-07May2020-to-30Apr2022.csv file --> rawdata DataFrame 📂

ℹ️ Load Data from data/cleaned/cleaned_Takapuna23-07May2020-to-30Apr2022.csv file --> rawdata DataFrame 📂

ℹ️ The Shape of the rawdata Dataframe: (34734, 13)
## Code to execute if data is in Teradata Vantage and it's the ClearScape Analytics™ environment
if IS_DATA_IN_TERADATA_VANTAGE and IS_LOADING_FROM_FILES:
print("\nℹ️ Prepare Data in Teradata Vantage from Pandas DataFrame \n")
if_exists = 'replace'
# df_table = load_and_prepare_data_in_database(rawdata, SCHEMA_NAME, TABLE_NAME, if_exists=if_exists)
df_table = load_and_prepare_data_in_database(rawdata, TABLE_NAME, if_exists=if_exists)
## [DEBUG] Info
print(f'\nℹ️ Loading Data in Teradata Vantage: {type(df_table)}; df_table.shape: {df_table.shape}')
ℹ️ Prepare Data in Teradata Vantage from Pandas DataFrame

Processed 34734 rows in batch 1.

ℹ️ Loading Data in Teradata Vantage: <class 'teradataml.dataframe.dataframe.DataFrame'>; df_table.shape: (34734, 13)
## Code to execute if data is in Teradata Vantage and it's the ClearScape Analytics™ environment
if IS_DATA_IN_TERADATA_VANTAGE and IS_TERADATA_VANTAGE and not IS_LOADING_FROM_FILES:
print("\nℹ️ Loading Data in Teradata Vantage to Pandas DataFrame \n")
df_table = TeradataDataFrame(TABLE_NAME, index_label=["Site", "Timestamp"])
# df_table = TeradataDataFrame.from_table(f'"{SCHEMA_NAME}"."{TABLE_NAME}"')
# df_table
# df_table.shape
# db_list_tables(object_name = "%")
rawdata = df_table.to_pandas()
rawdata
 | Timestamp | AQI | PM10 | PM2.5 | SO2 | NO | NO2 | NOx | Wind_Speed | Wind_Dir | Air_Temp | Rel_Humidity | Site |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-05-07 00:00:00 | 26.0 | 16.50 | 16.10 | 1.8 | 60.30 | 40.90000 | 101.20 | 0.60 | 316.0 | 8.0 | 78.00 | Penrose |
1 | 2020-05-07 01:00:00 | 28.0 | 17.70 | 10.10 | 1.0 | 16.00 | 29.20000 | 45.30 | 0.70 | 269.0 | 8.0 | 76.80 | Penrose |
2 | 2020-05-07 02:00:00 | 28.0 | 15.00 | 10.30 | 0.1 | 16.00 | 29.20000 | 45.30 | 1.00 | 180.0 | 8.0 | 78.40 | Penrose |
3 | 2020-05-07 03:00:00 | 29.0 | 14.30 | 11.40 | 0.0 | 11.20 | 27.50000 | 38.70 | 0.80 | 232.0 | 8.0 | 77.50 | Penrose |
4 | 2020-05-07 04:00:00 | 30.0 | 8.80 | 10.60 | -0.1 | 12.00 | 28.50000 | 40.50 | 0.80 | 274.0 | 7.0 | 80.10 | Penrose |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34729 | 2022-04-30 19:00:00 | 14.0 | 4.75 | 3.30 | NaN | 0.60 | 0.00440 | 5.00 | 2.55 | 109.5 | 16.0 | 76.05 | Takapuna |
34730 | 2022-04-30 20:00:00 | 14.0 | 6.35 | 3.15 | NaN | 0.50 | 0.00365 | 4.15 | 2.45 | 105.5 | 16.0 | 74.50 | Takapuna |
34731 | 2022-04-30 21:00:00 | 14.0 | 6.05 | 2.80 | NaN | 0.40 | 0.00480 | 5.20 | 2.35 | 115.5 | 16.0 | 74.15 | Takapuna |
34732 | 2022-04-30 22:00:00 | 13.0 | 4.20 | 2.60 | NaN | 0.40 | 0.00555 | 5.90 | 1.95 | 122.5 | 16.0 | 75.95 | Takapuna |
34733 | 2022-04-30 23:00:00 | 13.0 | 5.00 | 2.80 | NaN | 0.35 | 0.00405 | 4.30 | 1.95 | 119.0 | 16.0 | 78.00 | Takapuna |
34734 rows × 13 columns
db_list_tables(object_name = "%")
 | TableName |
---|---|
0 | Air_Pollution_cleaned |
IS_CLEANUP_VANTAGE=False
## Code to execute if data is in Teradata Vantage and it's the ClearScape Analytics™ environment
if IS_DATA_IN_TERADATA_VANTAGE and IS_CLEANUP_VANTAGE:
tables = [TABLE_NAME]
## Loop through the list of tables and execute the drop table command for each table
for table in tables:
try:
print(f"Dropping the table {table} ...")
db_drop_table(table_name=table)
except Exception:
pass
print("Removing the Teradata Vantage context ...")
remove_context()
## The rawdata is retrieved from "2.1. Load Data from *.csv files" and/or "2.2. Prepare Data in Teradata Vantage"
rawdata_site1 = rawdata[rawdata['Site'] == 'Penrose'][include_columns_site1]
rawdata_site2 = rawdata[rawdata['Site'] == 'Takapuna'][include_columns_site2]
print("ℹ️ The Shape of the Dataframe rawdata_site1 (Penrose) and rawdata_site2 (Takapuna):", rawdata_site1.shape, rawdata_site2.shape)
ℹ️ The Shape of the Dataframe rawdata_site1 (Penrose) and rawdata_site2 (Takapuna): (17375, 12) (17359, 11)
# if IS_DEBUG:
## 1. Describe Data and attribute-types: numerical(continuous), categorical (nominal, ordinal)
## 'Timestamp' as the only Ordinal Attribute/Column, given its natural ordering in time-series data
ordinal_columns = ['Timestamp']
print("\n🎓 Describing the types of each attribute as numerical_columns (Continuous), ordinal_columns (Ordinal), or nominal_columns (Nominal) ...")
numerical_columns, nominal_columns = DescriptiveStatistics.describe_data(rawdata, ordinal_columns, numerical_columns)
# print("\n🎓 0. Summary Statistics of the Dataframe such as the mean, maximum and minimum values ...")
# rawdata.describe().T.style.set_table_attributes("style='display:inline'").bar(color='#3dc4c4') ## color='dodgerblue'
# rawdata.describe()
print("\n🎓 1. [Site 1 - Penrose][numerical_columns_S1, nominal_columns_S1] Summary Statistics of the Dataframe such as the mean, maximum and minimum values ...")
numerical_columns_site1, nominal_columns_site1 = DescriptiveStatistics.describe_data(rawdata_site1, ordinal_columns, include_columns_site1)
# rawdata_site1.describe().T.style.set_table_attributes("style='display:inline'").bar(color='dodgerblue') ## color='dodgerblue' | '#3dc4c4'
print("\n🎓 2. [Site 2 - Takapuna][numerical_columns_S2, nominal_columns_S2] Summary Statistics of the {site2} Dataframe such as the mean, maximum and minimum values ...")
numerical_columns_site2, nominal_columns_site2 = DescriptiveStatistics.describe_data(rawdata_site2, ordinal_columns, include_columns_site2)
## Verify Data Quality: check_duplicate_rows and check_missing_values
# duplicate_records = DescriptiveStatistics.check_duplicate_rows(rawdata)
🎓 Describing the types of each attribute as numerical_columns (Continuous), ordinal_columns (Ordinal), or nominal_columns (Nominal) ...

ℹ️ Numerical Variables/Features:
['AQI', 'PM10', 'PM2.5', 'SO2', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']

ℹ️ Ordinal Variables/Features:
['Timestamp']

ℹ️ Nominal Variables/Features:
Index(['Site'], dtype='object')

🎓 1. [Site 1 - Penrose][numerical_columns_S1, nominal_columns_S1] Summary Statistics of the Dataframe such as the mean, maximum and minimum values ...

ℹ️ Numerical Variables/Features:
['AQI', 'PM10', 'PM2.5', 'SO2', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']

ℹ️ Ordinal Variables/Features:
['Timestamp']

🎓 2. [Site 2 - Takapuna][numerical_columns_S2, nominal_columns_S2] Summary Statistics of the Takapuna Dataframe such as the mean, maximum and minimum values ...

ℹ️ Numerical Variables/Features:
['AQI', 'PM10', 'PM2.5', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']

ℹ️ Ordinal Variables/Features:
['Timestamp']
if IS_DEBUG:
print("\n🎓 [Site1 - Penrose] Summary Statistics of the {site1} rawdata_site1 Dataframe such as the mean, max/minimum values ...")
# rawdata_site1.describe()
🎓 [Site1 - Penrose] Summary Statistics of the {site1} rawdata_site1 Dataframe such as the mean, max/minimum values ...
if IS_DEBUG:
print("\n🎓 [Site2 - Takapuna] Summary Statistics of the {site2} rawdata_site2 Dataframe such as the mean, max/minimum values ...")
# rawdata_site2.describe()
🎓 [Site2 - Takapuna] Summary Statistics of the {site2} rawdata_site2 Dataframe such as the mean, max/minimum values ...
from prettytable import PrettyTable
def display_site_comparison():
"""
This function creates and displays a table that lists various data attributes along with descriptions and
their applicability to different datasets across sites using the PrettyTable library.
"""
## Define the PrettyTable table columns and set column alignments
table = PrettyTable()
table.field_names = ["Variable Name", "Description", "All Sites", "Penrose", "Takapuna"]
table.align["Variable Name"] = "l"
table.align["Description"] = "l"
table.align["All Sites"] = "c"
table.align["Penrose"] = "c"
table.align["Takapuna"] = "c"
## Define the data dictionary with a clear structure containing information about the datasets, plus placeholders for potential future expansion
data_dictionary = [
("rawdata", "Complete dataset containing all observations across all sites.", "[x]", "[x]", "[x]"),
("numerical_columns_site1", "Numerical columns specific to Site 1.", "[ ]", "[x]", "[ ]"),
("nominal_columns_site1", "Nominal columns specific to Site 1.", "[ ]", "[x]", "[ ]"),
("numerical_columns_site2", "Numerical columns specific to Site 2.", "[ ]", "[ ]", "[x]"),
("nominal_columns_site2", "Nominal columns specific to Site 2.", "[ ]", "[ ]", "[x]"),
("rawdata_site1", "Subset of raw data for Site 1.", "[ ]", "[x]", "[ ]"),
("rawdata_site2", "Subset of raw data for Site 2.", "[ ]", "[ ]", "[x]"),
# ("---------------------------", "---------------------------------------------------------------------", "---------", "-------", "--------"), ## Blank line for separation
# ("cleaned_data", "Cleaned dataset with preprocessing applied.", "[x]", "[x]", "[x]"),
# ("cleaned_ordinal_columns", "Ordinal columns in the cleaned dataset.", "[x]", "[x]", "[x]"),
# ("cleaned_numerical_columns", "Numerical columns in the cleaned dataset.", "[x]", "[x]", "[x]"),
# ("cleaned_nominal_columns", "Nominal columns in the cleaned dataset.", "[x]", "[x]", "[x]"),
# ("cleaned_data1", "Cleaned data for Site 1.", "[ ]", "[x]", "[ ]"),
# ("cleaned_data2", "Cleaned data for Site 2.", "[ ]", "[ ]", "[x]"),
# ("---------------------------", "---------------------------------------------------------------------", "---------", "-------", "--------"), ## Blank line for separation
# ("top_features_PM25_site1", "Top features correlated with PM2.5 at Site 1 (Penrose).", "[ ]", "[x]", "[ ]"),
# ("top_features_PM25_site2", "Top features correlated with PM2.5 at Site 2 (Takapuna).", "[ ]", "[ ]", "[x]"),
# ("top_features_PM10_site1", "Top features correlated with PM10 at Site 1.", "[ ]", "[x]", "[ ]"),
# ("top_features_PM10_site2", "Top features correlated with PM10 at Site 2.", "[ ]", "[ ]", "[x]"),
# ("summary_stats_PM25_penrose", "[PM2.5] Summary statistics for the Penrose site after preprocessing.", "[ ]", "[x]", "[ ]"),
# ("summary_stats_PM25_takapuna", "[PM2.5] Summary statistics for the Takapuna site after preprocessing.", "[ ]", "[ ]", "[x]"),
# ("summary_stats_PM10_penrose", "[PM10] Summary statistics for the Penrose site after preprocessing.", "[ ]", "[x]", "[ ]"),
# ("summary_stats_PM10_takapuna", "[PM10] Summary statistics for the Takapuna site after preprocessing.", "[ ]", "[ ]", "[x]"),
# ("selected_features_PM25_penrose", "[PM2.5] Target + Selected Features for the Penrose site.", "[ ]", "[x]", "[ ]"),
# ("selected_features_PM25_takapuna", "[PM2.5] Target + Selected Features for the Takapuna site.", "[ ]", "[ ]", "[x]"),
# ("selected_features_PM10_penrose", "[PM10] Target + Selected Features for the Penrose site.", "[ ]", "[x]", "[ ]"),
# ("selected_features_PM10_takapuna", "[PM10] Target + Selected Features for the Takapuna site.", "[ ]", "[ ]", "[x]"),
]
## Format for better readability
for entry in data_dictionary:
## Adding rows to the table
table.add_row(entry)
## Print the table in an organized format
print(table)
## Call the function to display the table
print('\n🎓 Listing variables with description...')
display_site_comparison()
🎓 Listing variables with description...
+--------------------------+-----------------------------------------------------------------+-----------+---------+----------+
| Variable Name            | Description                                                     | All Sites | Penrose | Takapuna |
+--------------------------+-----------------------------------------------------------------+-----------+---------+----------+
| rawdata                  | Complete dataset containing all observations across all sites.  |    [x]    |   [x]   |   [x]    |
| numerical_columns_site1  | Numerical columns specific to Site 1.                           |    [ ]    |   [x]   |   [ ]    |
| nominal_columns_site1    | Nominal columns specific to Site 1.                             |    [ ]    |   [x]   |   [ ]    |
| numerical_columns_site2  | Numerical columns specific to Site 2.                           |    [ ]    |   [ ]   |   [x]    |
| nominal_columns_site2    | Nominal columns specific to Site 2.                             |    [ ]    |   [ ]   |   [x]    |
| rawdata_site1            | Subset of raw data for Site 1.                                  |    [ ]    |   [x]   |   [ ]    |
| rawdata_site2            | Subset of raw data for Site 2.                                  |    [ ]    |   [ ]   |   [x]    |
+--------------------------+-----------------------------------------------------------------+-----------+---------+----------+
if IS_DEBUG:
## 2. Correlations Heatmap with Regression
DescriptiveStatistics.correlations_heatmap_with_regression(rawdata, numerical_columns, 'Site', corner=True)
DescriptiveStatistics.correlations_heatmap_with_regression(rawdata, numerical_columns, 'Site', corner=False)
## 3. Correlations Heatmap
if IS_DEBUG:
## Filter rawdata for the specified sites
## Copy the rawdata for safety so the original data is not modified
## Set 'Site' and 'Timestamp' as a multi-level index
# rawdata.set_index(['Site', 'Timestamp'], inplace=True)
print("\n🎓 If data is skewed, transform the data (using statsmodels/pmdarima's BoxCoxEndogTransformer or LogEndogTransformer transformation) to stabilise the variance.")
# rawdata_site1 = rawdata.xs('Penrose', level='Site')[numerical_columns_S1]
# numerical_columns_2 = numerical_columns.drop('SO2') ## Removing 'SO2' from the Index
# rawdata_site2 = rawdata.xs('Takapuna', level='Site')[numerical_columns_S1]
correlation_matrix = DescriptiveStatistics.correlations_matrix(rawdata_site1, method='pearson', figsize=(20,15), title="[Site1 - Penrose] Correlation Matrix Heatmap")
correlation_matrix
correlation_matrix = DescriptiveStatistics.correlations_matrix(rawdata_site2, method='pearson', figsize=(20,15), title="[Site2 - Takapuna] Correlation Matrix Heatmap",_rotate=True)
correlation_matrix
🎓 If data is skewed, transform the data (using pmdarima's BoxCoxEndogTransformer or LogEndogTransformer) to stabilise the variance.
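A minimal sketch of that check, assuming the cleaned PM2.5 series at Penrose is strictly positive and using pmdarima's BoxCoxEndogTransformer (LogEndogTransformer would be analogous); variable names here are illustrative:

from scipy.stats import skew
from pmdarima.preprocessing import BoxCoxEndogTransformer

## Quantify the skewness of the target series (values well above 0 indicate a pronounced right skew)
pm25_penrose = rawdata_site1['PM2.5'].dropna()
print(f"PM2.5 skewness (Penrose): {skew(pm25_penrose):.2f}")

## Stabilise the variance with a Box-Cox transform; lmbda2 adds a small shift so near-zero readings do not break it
boxcox_transformer = BoxCoxEndogTransformer(lmbda2=1e-6)
pm25_boxcox, _ = boxcox_transformer.fit_transform(pm25_penrose)
print(f"PM2.5 skewness after Box-Cox: {skew(pm25_boxcox):.2f}")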