import os, logging
## .env --> Set the ENV_PATH environment variable programmatically so the %run notebooks load the intended configuration file
os.environ['ENV_PATH'] = 'cleaned_vantage.env' ## Set to `.env_cleaned` or `.env_raw` as needed
# %run -i ./Data_Loading_and_Descriptive_Statistics.ipynb
%run -i ./DataFrameAdapter.ipynb
2024-05-15 23:53:08,507 - INFO - NumExpr defaulting to 2 threads.
🎓 The specified .env file: cleaned_vantage.env

| Variable | Description | Value |
|---|---|---|
| IS_LOADING_FROM_FILES | True if loading data from *.csv/xls files; False if using imported data in Teradata Vantage | True |
| IS_TERADATA_VANTAGE | Using scalable Teradata Vantage vs. local machine (laptop) | True |
| IS_DATA_IN_TERADATA_VANTAGE | Using TeradataDataFrame in scalable Vantage vs. PandasDataFrame with local *.csv/xls files | False |
| SCHEMA_NAME | [Teradata Vantage] Schema Name | Air_Pollution |
| TABLE_NAME | [Teradata Vantage] Table Name | Air_Pollution_cleaned |
| IS_JUPYTERLAB | Running in JupyterLab vs. Python Dash/Vizro Dashboard | True |
| IS_TEST_DEV | Whether Test/Dev mode is active (vs. Production) | False |
| DATA_PATH | *.csv/xls Data PATH | Not set or not found |
| USE_DATA_PREFIX | Prefix to use for data files: 'raw' \| 'cleaned' | cleaned |

Performing setup ... Setup complete
... Logon successful
Connected as: xxxxxsql://demo_user:xxxxx@host.docker.internal/dbc
2024-05-15 23:53:24,302 - INFO - Engine(teradatasql://demo_user:***@host.docker.internal)
2024-05-15 23:53:24,478 - INFO - data/cleaned/cleaned_Takapuna23-07May2020-to-30Apr2022.csv
2024-05-15 23:53:24,479 - INFO - data/cleaned/cleaned_Penrose7-07May2020-to-30Apr2022.csv
2024-05-15 23:53:24,525 - INFO - ℹ️ Load Data from data/cleaned/cleaned_Penrose7-07May2020-to-30Apr2022.csv file --> rawdata DataFrame 📂
ℹ️ Load Data from data/cleaned folder
2024-05-15 23:53:24,782 - INFO - ℹ️ Load Data from data/cleaned/cleaned_Takapuna23-07May2020-to-30Apr2022.csv file --> rawdata DataFrame 📂
ℹ️ Combined Data Shape: (34734, 13)
ℹ️ The Shape of the Dataframes rawdata_site1 (Penrose) and rawdata_site2 (Takapuna): (17375, 12) (17359, 11)
🎓 Describing the types of each attribute as numerical_columns (Continuous), ordinal_columns (Ordinal), or nominal_columns (Nominal) ...
ℹ️ Numerical Variables/Features: ['AQI', 'PM10', 'PM2.5', 'SO2', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']
ℹ️ Ordinal Variables/Features: ['Timestamp']
ℹ️ Nominal Variables/Features: Index(['Site'], dtype='object')
🎓 1. [Site 1 - Penrose][numerical_columns_S1, nominal_columns_S1] Summary Statistics of the Dataframe such as the mean, maximum and minimum values ...
ℹ️ Numerical Variables/Features: ['AQI', 'PM10', 'PM2.5', 'SO2', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']
ℹ️ Ordinal Variables/Features: ['Timestamp']
🎓 2. [Site 2 - Takapuna][numerical_columns_S2, nominal_columns_S2] Summary Statistics of the Takapuna Dataframe such as the mean, maximum and minimum values ...
ℹ️ Numerical Variables/Features: ['AQI', 'PM10', 'PM2.5', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']
ℹ️ Ordinal Variables/Features: ['Timestamp']
🎓 [Data_Loading_and_Descriptive_Statistics.ipynb] Listing variables with description...

| Variable Name | Description | All Sites | Penrose | Takapuna |
|---|---|---|---|---|
| rawdata | Complete dataset containing all observations across all sites. | [x] | [x] | [x] |
| ordinal_columns | Ordinal columns shared across all sites. | [x] | [x] | [x] |
| numerical_columns_site1 | Numerical columns specific to Site 1. | [ ] | [x] | [ ] |
| nominal_columns_site1 | Nominal columns specific to Site 1. | [ ] | [x] | [ ] |
| numerical_columns_site2 | Numerical columns specific to Site 2. | [ ] | [ ] | [x] |
| nominal_columns_site2 | Nominal columns specific to Site 2. | [ ] | [ ] | [x] |
| rawdata_site1 | Subset of raw data for Site 1. | [ ] | [x] | [ ] |
| rawdata_site2 | Subset of raw data for Site 2. | [ ] | [ ] | [x] |

🎓 Describing the types of each attribute as cleaned_numerical_columns (Continuous), cleaned_ordinal_columns (Ordinal), or cleaned_nominal_columns (Nominal) ...
ℹ️ Numerical Variables/Features: ['AQI', 'PM10', 'PM2.5', 'SO2', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity', 'Hour', 'Day', 'DayOfWeek', 'Month', 'Quarter', 'Year', 'WeekOfYear', 'Season', 'PM2.5_Lag1', 'PM2.5_Lag2', 'PM10_Lag1', 'PM10_Lag2']
ℹ️ Ordinal Variables/Features: ['Timestamp']
ℹ️ Nominal Variables/Features: Index(['Site'], dtype='object')
🎓 [Site1 - Penrose] Summary Statistics of the cleaned_data_site1 Dataframe such as the mean, max/minimum values ...
🎓 [Site2 - Takapuna] Summary Statistics of the cleaned_data_site2 Dataframe such as the mean, max/minimum values ...
🎓 [DataFrameAdapter.ipynb] Listing variables with description...
| Variable Name | Description | All Sites | Penrose | Takapuna |
|---|---|---|---|---|
| rawdata | Complete dataset containing all observations across all sites. | [x] | [x] | [x] |
| numerical_columns_site1 | Numerical columns specific to Site 1. | [ ] | [x] | [ ] |
| nominal_columns_site1 | Nominal columns specific to Site 1. | [ ] | [x] | [ ] |
| numerical_columns_site2 | Numerical columns specific to Site 2. | [ ] | [ ] | [x] |
| nominal_columns_site2 | Nominal columns specific to Site 2. | [ ] | [ ] | [x] |
| rawdata_site1 | Subset of raw data for Site 1. | [ ] | [x] | [ ] |
| rawdata_site2 | Subset of raw data for Site 2. | [ ] | [ ] | [x] |
| cleaned_data | Cleaned dataset with preprocessing applied. | [x] | [x] | [x] |
| cleaned_ordinal_columns | Ordinal columns in the cleaned dataset. | [x] | [x] | [x] |
| cleaned_numerical_columns | Numerical columns in the cleaned dataset. | [x] | [x] | [x] |
| cleaned_nominal_columns | Nominal columns in the cleaned dataset. | [x] | [x] | [x] |
| cleaned_data_site1 | Cleaned data for Site 1. | [ ] | [x] | [ ] |
| cleaned_data_site2 | Cleaned data for Site 2. | [ ] | [ ] | [x] |
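For reference, the downstream notebooks resolve this configuration through the ENV_PATH variable set above. A minimal sketch of that mechanism, assuming python-dotenv is used (the actual loader in DataFrameAdapter.ipynb may differ):

```python
import os
from dotenv import dotenv_values  ## Assumption: python-dotenv is installed

env_path = os.environ.get('ENV_PATH', '.env')  ## Fall back to a default .env file
config = dotenv_values(env_path)               ## Parse KEY=VALUE pairs into a dict
IS_TERADATA_VANTAGE = config.get('IS_TERADATA_VANTAGE', 'False') == 'True'
print(f"Loaded {len(config)} settings from {env_path}; IS_TERADATA_VANTAGE={IS_TERADATA_VANTAGE}")
```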
rawdata
| | Timestamp | AQI | PM10 | PM2.5 | SO2 | NO | NO2 | NOx | Wind_Speed | Wind_Dir | Air_Temp | Rel_Humidity | Site |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-05-07 00:00:00 | 26.0 | 16.50 | 16.10 | 1.8 | 60.30 | 40.90000 | 101.20 | 0.60 | 316.0 | 8.0 | 78.00 | Penrose |
1 | 2020-05-07 01:00:00 | 28.0 | 17.70 | 10.10 | 1.0 | 16.00 | 29.20000 | 45.30 | 0.70 | 269.0 | 8.0 | 76.80 | Penrose |
2 | 2020-05-07 02:00:00 | 28.0 | 15.00 | 10.30 | 0.1 | 16.00 | 29.20000 | 45.30 | 1.00 | 180.0 | 8.0 | 78.40 | Penrose |
3 | 2020-05-07 03:00:00 | 29.0 | 14.30 | 11.40 | 0.0 | 11.20 | 27.50000 | 38.70 | 0.80 | 232.0 | 8.0 | 77.50 | Penrose |
4 | 2020-05-07 04:00:00 | 30.0 | 8.80 | 10.60 | -0.1 | 12.00 | 28.50000 | 40.50 | 0.80 | 274.0 | 7.0 | 80.10 | Penrose |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34729 | 2022-04-30 19:00:00 | 14.0 | 4.75 | 3.30 | NaN | 0.60 | 0.00440 | 5.00 | 2.55 | 109.5 | 16.0 | 76.05 | Takapuna |
34730 | 2022-04-30 20:00:00 | 14.0 | 6.35 | 3.15 | NaN | 0.50 | 0.00365 | 4.15 | 2.45 | 105.5 | 16.0 | 74.50 | Takapuna |
34731 | 2022-04-30 21:00:00 | 14.0 | 6.05 | 2.80 | NaN | 0.40 | 0.00480 | 5.20 | 2.35 | 115.5 | 16.0 | 74.15 | Takapuna |
34732 | 2022-04-30 22:00:00 | 13.0 | 4.20 | 2.60 | NaN | 0.40 | 0.00555 | 5.90 | 1.95 | 122.5 | 16.0 | 75.95 | Takapuna |
34733 | 2022-04-30 23:00:00 | 13.0 | 5.00 | 2.80 | NaN | 0.35 | 0.00405 | 4.30 | 1.95 | 119.0 | 16.0 | 78.00 | Takapuna |
34734 rows × 13 columns
print("\n🎓 [Site1 - Penrose] Summary Statistics of the {site1} rawdata_site1 Dataframe such as the mean, max/minimum values ...")
cleaned_data_site1.describe()
🎓 [Site1 - Penrose] Summary Statistics of the cleaned_data_site1 Dataframe such as the mean, max/minimum values ...
| | Timestamp | AQI | PM10 | PM2.5 | SO2 | NO | NO2 | NOx | Wind_Speed | Wind_Dir | ... | DayOfWeek | Month | Quarter | Year | WeekOfYear | Season | PM2.5_Lag1 | PM2.5_Lag2 | PM10_Lag1 | PM10_Lag2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17375 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | ... | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 |
mean | 2021-05-03 23:07:19.873381376 | 28.233712 | 13.886201 | 5.400065 | 0.846799 | 9.871754 | 13.973526 | 23.837744 | 2.805975 | 173.193482 | ... | 3.004259 | 6.538935 | 2.513899 | 2020.835568 | 26.883856 | 2.502791 | 5.400681 | 5.401262 | 13.887018 | 13.887496 |
min | 2020-05-07 00:00:00 | 0.000000 | -6.400000 | -12.300000 | -4.400000 | -1.900000 | -1.400000 | -1.300000 | 0.200000 | 1.000000 | ... | 0.000000 | 1.000000 | 1.000000 | 2020.000000 | 1.000000 | 1.000000 | -12.300000 | -12.300000 | -6.400000 | -6.400000 |
25% | 2020-11-03 23:30:00 | 22.000000 | 8.500000 | 2.200000 | 0.100000 | 0.900000 | 4.100000 | 5.300000 | 1.600000 | 74.000000 | ... | 1.000000 | 4.000000 | 2.000000 | 2020.000000 | 13.000000 | 1.000000 | 2.200000 | 2.200000 | 8.500000 | 8.500000 |
50% | 2021-05-03 23:00:00 | 27.000000 | 13.100000 | 4.700000 | 0.500000 | 3.700000 | 10.500000 | 14.800000 | 2.600000 | 210.000000 | ... | 3.000000 | 7.000000 | 3.000000 | 2021.000000 | 27.000000 | 3.000000 | 4.700000 | 4.700000 | 13.100000 | 13.100000 |
75% | 2021-10-31 22:30:00 | 33.000000 | 18.300000 | 7.700000 | 1.000000 | 10.700000 | 20.300000 | 31.100000 | 3.800000 | 235.000000 | ... | 5.000000 | 10.000000 | 4.000000 | 2021.000000 | 40.000000 | 4.000000 | 7.700000 | 7.700000 | 18.300000 | 18.300000 |
max | 2022-04-30 23:00:00 | 121.000000 | 196.700000 | 138.400000 | 25.200000 | 378.400000 | 83.100000 | 455.100000 | 8.200000 | 357.000000 | ... | 6.000000 | 12.000000 | 4.000000 | 2022.000000 | 53.000000 | 4.000000 | 138.400000 | 138.400000 | 196.700000 | 196.700000 |
std | NaN | 9.608861 | 7.958464 | 5.204060 | 1.482960 | 20.526086 | 12.520003 | 30.254179 | 1.533803 | 93.604420 | ... | 1.997634 | 3.459325 | 1.120423 | 0.684702 | 15.234194 | 1.123019 | 5.204694 | 5.205324 | 7.958003 | 7.957911 |
8 rows × 24 columns
print("\n🎓 [Site2 - Takapuna] Summary Statistics of the {site2} rawdata_site2 Dataframe such as the mean, maximum and minimum values ...")
cleaned_data_site2.describe()
🎓 [Site2 - Takapuna] Summary Statistics of the cleaned_data_site2 Dataframe such as the mean, maximum and minimum values ...
| | Timestamp | AQI | PM10 | PM2.5 | SO2 | NO | NO2 | NOx | Wind_Speed | Wind_Dir | ... | DayOfWeek | Month | Quarter | Year | WeekOfYear | Season | PM2.5_Lag1 | PM2.5_Lag2 | PM10_Lag1 | PM10_Lag2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17359 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.0 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | ... | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 |
mean | 2021-05-04 08:00:00 | 25.152947 | 11.562504 | 6.237181 | 0.5 | 11.637256 | 0.012103 | 23.521597 | 2.497209 | 176.424542 | ... | 3.004148 | 6.540181 | 2.514315 | 2020.836454 | 26.890316 | 2.502275 | 6.237259 | 6.237348 | 11.562559 | 11.562660 |
min | 2020-05-07 17:00:00 | -13.000000 | -3.800000 | 0.450000 | 0.5 | -0.800000 | -0.000500 | -0.700000 | 0.200000 | 5.500000 | ... | 0.000000 | 1.000000 | 1.000000 | 2020.000000 | 1.000000 | 1.000000 | 0.450000 | 0.450000 | -3.800000 | -3.800000 |
25% | 2020-11-04 12:30:00 | 19.000000 | 7.468750 | 4.300000 | 0.5 | 2.350000 | 0.003950 | 6.912500 | 1.250000 | 76.500000 | ... | 1.000000 | 4.000000 | 2.000000 | 2020.000000 | 13.000000 | 1.000000 | 4.300000 | 4.300000 | 7.468750 | 7.468750 |
50% | 2021-05-04 08:00:00 | 24.000000 | 10.650000 | 5.550000 | 0.5 | 5.700000 | 0.008800 | 14.700000 | 2.300000 | 203.500000 | ... | 3.000000 | 7.000000 | 3.000000 | 2021.000000 | 27.000000 | 2.000000 | 5.550000 | 5.550000 | 10.650000 | 10.650000 |
75% | 2021-11-01 03:30:00 | 30.000000 | 14.400000 | 7.300000 | 0.5 | 11.900000 | 0.017100 | 28.500000 | 3.500000 | 260.000000 | ... | 5.000000 | 10.000000 | 4.000000 | 2021.000000 | 40.000000 | 4.000000 | 7.300000 | 7.300000 | 14.400000 | 14.400000 |
max | 2022-04-30 23:00:00 | 318.000000 | 437.250000 | 52.300000 | 0.5 | 352.450000 | 0.072100 | 401.950000 | 7.800000 | 346.500000 | ... | 6.000000 | 12.000000 | 4.000000 | 2022.000000 | 53.000000 | 4.000000 | 52.300000 | 52.300000 | 437.250000 | 437.250000 |
std | NaN | 10.637321 | 7.778661 | 3.401947 | 0.0 | 21.970397 | 0.010751 | 29.903734 | 1.455767 | 92.268417 | ... | 1.998613 | 3.460755 | 1.120882 | 0.684574 | 15.240120 | 1.123436 | 3.401884 | 3.401809 | 7.778618 | 7.778534 |
8 rows × 24 columns
cleaned_data
| | Timestamp | AQI | PM10 | PM2.5 | SO2 | NO | NO2 | NOx | Wind_Speed | Wind_Dir | ... | DayOfWeek | Month | Quarter | Year | WeekOfYear | Season | PM2.5_Lag1 | PM2.5_Lag2 | PM10_Lag1 | PM10_Lag2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-05-07 00:00:00 | 26.0 | 16.50 | 16.10 | 1.8 | 60.30 | 40.90000 | 101.20 | 0.60 | 316.0 | ... | 3 | 5 | 2 | 2020 | 19 | 3 | 16.10 | 16.10 | 16.50 | 16.50 |
1 | 2020-05-07 01:00:00 | 28.0 | 17.70 | 10.10 | 1.0 | 16.00 | 29.20000 | 45.30 | 0.70 | 269.0 | ... | 3 | 5 | 2 | 2020 | 19 | 3 | 16.10 | 16.10 | 16.50 | 16.50 |
2 | 2020-05-07 02:00:00 | 28.0 | 15.00 | 10.30 | 0.1 | 16.00 | 29.20000 | 45.30 | 1.00 | 180.0 | ... | 3 | 5 | 2 | 2020 | 19 | 3 | 10.10 | 16.10 | 17.70 | 16.50 |
3 | 2020-05-07 03:00:00 | 29.0 | 14.30 | 11.40 | 0.0 | 11.20 | 27.50000 | 38.70 | 0.80 | 232.0 | ... | 3 | 5 | 2 | 2020 | 19 | 3 | 10.30 | 10.10 | 15.00 | 17.70 |
4 | 2020-05-07 04:00:00 | 30.0 | 8.80 | 10.60 | -0.1 | 12.00 | 28.50000 | 40.50 | 0.80 | 274.0 | ... | 3 | 5 | 2 | 2020 | 19 | 3 | 11.40 | 10.30 | 14.30 | 15.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34729 | 2022-04-30 19:00:00 | 14.0 | 4.75 | 3.30 | 0.5 | 0.60 | 0.00440 | 5.00 | 2.55 | 109.5 | ... | 5 | 4 | 2 | 2022 | 17 | 3 | 3.25 | 2.85 | 5.90 | 5.85 |
34730 | 2022-04-30 20:00:00 | 14.0 | 6.35 | 3.15 | 0.5 | 0.50 | 0.00365 | 4.15 | 2.45 | 105.5 | ... | 5 | 4 | 2 | 2022 | 17 | 3 | 3.30 | 3.25 | 4.75 | 5.90 |
34731 | 2022-04-30 21:00:00 | 14.0 | 6.05 | 2.80 | 0.5 | 0.40 | 0.00480 | 5.20 | 2.35 | 115.5 | ... | 5 | 4 | 2 | 2022 | 17 | 3 | 3.15 | 3.30 | 6.35 | 4.75 |
34732 | 2022-04-30 22:00:00 | 13.0 | 4.20 | 2.60 | 0.5 | 0.40 | 0.00555 | 5.90 | 1.95 | 122.5 | ... | 5 | 4 | 2 | 2022 | 17 | 3 | 2.80 | 3.15 | 6.05 | 6.35 |
34733 | 2022-04-30 23:00:00 | 13.0 | 5.00 | 2.80 | 0.5 | 0.35 | 0.00405 | 4.30 | 1.95 | 119.0 | ... | 5 | 4 | 2 | 2022 | 17 | 3 | 2.60 | 2.80 | 4.20 | 6.05 |
34734 rows × 25 columns
def line_plot_time_series_with_slider(data_frame: pd.DataFrame, site_name='Penrose', timestamp_col='Timestamp', y_column='PM2.5', title='Average/Mean Concentration Over Time', freq='M', height=900):
"""
Line plots dynamically analyse data over hours, days, months, and years.
Plot time series data after resampling to a given frequency.
Line Plot for time series data with a Range Slider and Timestamp Selector for dynamic data exploration.
Plot time series data with dynamic threshold lines based on WHO 2021 guidelines.
Parameters:
data_frame (pd.DataFrame): DataFrame containing the time series data.
site_name (str): Name of the site or category of data.
timestamp_col (str): The name of the datetime column.
y_column (str): The name of the column to plot.
title (str): Title of the plot.
freq (str): Frequency code for resampling ('H', 'D', 'M', etc.); default is '1M' for monthly.
Raises:
ValueError: If required columns are not present or datetime conversion fails.
"""
## Dynamic thresholds based on the pollutant and frequency
thresholds = {
'PM2.5': {'D': 15, 'Y': 5}, ## 1D: Daily/24h and 1Y: Annual limits
'PM10': {'D': 45, 'Y': 15}, ## 1D: Daily/24h and 1Y: Annual limits
'NO2': {'D': 25, 'Y': 10}, ## 1D: Daily/24h and 1Y: Annual limits
## Add additional pollutants and thresholds as needed
## TODO: Daily --> a 99th percentile (i.e. 3–4 exceedance days per year).
}
if timestamp_col not in data_frame.columns or y_column not in data_frame.columns:
raise ValueError(f"🪲 {timestamp_col} or {y_column} columns are not in the DataFrame!")
CommmonUtils.ensure_timestamp_index(data_frame, timestamp_col) ## Convert column to datetime and set as index
data_frame_resampled = data_frame[y_column].resample(freq).mean() ## Resample data based on the provided frequency
## Plotly automatically uses the index for the x-axis and the Series values for the y-axis when the input is a Series.
fig = px.line(data_frame_resampled, labels={'value': f'{y_column} (µg/m³)', 'index': timestamp_col},
title=f'[{site_name}] {y_column} {title}')
## Adding guideline thresholds if applicable
pollutant_thresholds = thresholds.get(y_column, {})
limit = pollutant_thresholds.get(freq)
if limit:
period_label = '24-hour' if freq == 'D' else 'annual' ## Label follows the 'D'/'Y' threshold keys
fig.add_hline(y=limit, line_dash="dot", line_color='red',
annotation_text=f"WHO 2021 Guideline ({period_label} mean): {limit} µg/m³",
annotation_position="top right")
## Configuring the range slider and buttons for data navigation
fig.update_xaxes(
rangeslider_visible=True,
rangeselector=dict(
buttons=[
{"step": "all", "label": "All"},
{"count": 1, "label": "1 Year", "step": "year", "stepmode": "backward"},
{"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
{"count": 6, "label": "6 Months", "step": "month", "stepmode": "backward"},
{"count": 1, "label": "1 Month", "step": "month", "stepmode": "backward"},
{"count": 7, "label": "1 Week", "step": "day", "stepmode": "backward"},
{"count": 1, "label": "1 Day", "step": "day", "stepmode": "backward"},
]
)
)
fig.update_layout(height=height,
# template='plotly_dark', ## Apply dark theme for better visibility
showlegend=False)
fig.show()
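As a usage sketch (assuming cleaned_data_site1 from DataFrameAdapter.ipynb is in scope), a daily-mean PM2.5 plot for Penrose picks up the 24-hour WHO guideline via the 'D' threshold key:

```python
## Hedged example: cleaned_data_site1 is assumed to be loaded as shown earlier.
line_plot_time_series_with_slider(
    data_frame=cleaned_data_site1.copy(),  ## copy() so the index mutation stays local
    site_name='Site1-Penrose',
    y_column='PM2.5',
    freq='D',                              ## 'D' matches the daily threshold key above
)
```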
def scatter_plot_time_series_by_hour(data_frame: pd.DataFrame, site_name='Penrose', timestamp_col='Timestamp', y_column='PM2.5', title='Average/Mean Concentration by Hour of Day', height=600):
"""
Generates an interactive scatter plot showing data points for each hour of each day.
"""
if timestamp_col not in data_frame.columns or y_column not in data_frame.columns:
raise ValueError(f"{timestamp_col} or {y_column} columns are not in the DataFrame!")
data_frame[timestamp_col] = pd.to_datetime(data_frame[timestamp_col], errors='coerce')
data_frame['Hour'] = data_frame[timestamp_col].dt.hour # Extract hour from datetime
# data_frame['Day'] = data_frame[timestamp_col].dt.date # Extract date for grouping
scatter = [
go.Scatter(
x=data_frame['Hour'],
y=data_frame[y_column],
mode='markers',
marker=dict(
color=data_frame[y_column],
showscale=True,
# colorscale='Viridis', ## Adjusted color scale for better visibility
colorscale = 'OrRd',
colorbar=dict(title=y_column),
size=9,
opacity=0.55
)
)
]
layout = go.Layout(
title=f'[{site_name}] {y_column} {title}',
# xaxis=dict(title='Hour of Day'),
xaxis=dict(
title='Hour of Day',
tickmode='array',
# tickvals=list(range(24)), ## Ensuring every hour is marked
tickvals=list(range(0, 24, 2)), ## Hours from 0 to 23 every 2 hours
# ticktext=[f"{h}:00" for h in range(24)] ## Label hours as '0:00', '1:00', ..., '23:00'
ticktext=[f'{h:02}:00' for h in range(0, 24, 2)] ## Format labels as 'HH:00'
),
yaxis=dict(title=f'{y_column} (µg/m³)'),
height=height,
# template='plotly_dark'
)
fig = go.Figure(data=scatter, layout=layout)
fig.show()
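A quick usage sketch (again assuming the cleaned site DataFrames are in scope):

```python
## Hourly PM2.5 scatter for Takapuna; copy() because the function adds an 'Hour' column.
scatter_plot_time_series_by_hour(
    data_frame=cleaned_data_site2.copy(),
    site_name='Site2-Takapuna',
    y_column='PM2.5',
)
```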
def enhanced_bar_plot(data_frame, site_name='Penrose', group_by_col='Month', y_column='PM2.5', aggregation_func='sum', plot_title='Concentration per Season (Season Aggregation)', x_axis_title='Month', y_axis_title="(µg/m³)", color_scale='OrRd', height=450):
"""
Create a bar plot aggregating data by a specified aggregation_func function.
With enhanced features like dynamic coloring and outlier highlighting.
Parameters:
data_frame (pd.DataFrame): DataFrame containing the data to plot.
group_by_col (str): Column name to group by.
y_column (str): Column name to aggregate.
aggregation_func (str or callable): Aggregation to apply ('sum', 'mean', np.mean, etc.).
plot_title (str): Title of the plot.
x_axis_title (str): Title for the x-axis.
y_axis_title (str): Title for the y-axis.
color_scale (list or str): Color scale for the bars, e.g. 'Viridis' or 'OrRd'.
height (int): Height of the plot in pixels.
"""
## Validate column existence
if group_by_col not in data_frame.columns or y_column not in data_frame.columns:
raise ValueError("One or both specified columns are not in the DataFrame.")
## Aggregate the data
aggregated_data = data_frame.groupby(group_by_col).agg({y_column: aggregation_func}).reset_index()
## Ensure proper formatting of the x-axis values
if group_by_col == 'Year':
aggregated_data[group_by_col] = aggregated_data[group_by_col].astype(str)
elif group_by_col == 'Month': ## Formatting for Month names if applicable
aggregated_data[group_by_col] = pd.to_datetime(aggregated_data[group_by_col], format='%m').dt.month_name()
## Create the bar trace
trace = go.Bar(
x=aggregated_data[group_by_col],
y=aggregated_data[y_column],
text=aggregated_data[y_column].apply(lambda x: f'{x:,.2f}'),
marker=dict(color=aggregated_data[y_column], colorscale=color_scale) ## Applying color scale
)
## Set up the layout
layout = go.Layout(
title={'text': plot_title, 'y':0.9, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
xaxis={'title': x_axis_title},
# yaxis={'title': y_axis_title, 'tickformat': ',.2f'},
yaxis=dict(title=y_axis_title, tickformat=',.2f'),
height=height,
margin=dict(l=50, r=50, t=70, b=30), ## Adjust margins to ensure no cutoff
# template='plotly_dark',
)
## Create and display the figure
fig = go.Figure(data=[trace], layout=layout)
## Add median line using horizontal line
q1 = aggregated_data[y_column].quantile(0.25)
# median_value = aggregated_data[y_column].quantile(0.5)
median_value = aggregated_data[y_column].median()
q3 = aggregated_data[y_column].quantile(0.75)
fig.add_hline(y=median_value, line_dash="dot",
annotation_text="Median", annotation_position="top right")
## Add quartile lines
fig.add_hline(y=q1, line_dash="dot", line_color="blue",
annotation_text="Q1 (Quantile 25%)", annotation_position="top left")
# fig.add_hline(y=median_value, line_dash="dot", line_color="green",
# annotation_text="Median", annotation_position="top left")
fig.add_hline(y=q3, line_dash="dot", line_color="red",
annotation_text="Q3 (Quantile 75%)", annotation_position="top left")
iplot(fig)
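For example, a seasonal mean aggregation can be drawn directly (a sketch assuming cleaned_data_site1 is in scope; any pandas-`.agg`-compatible aggregation name or callable works):

```python
enhanced_bar_plot(
    data_frame=cleaned_data_site1,
    group_by_col='Season',          ## 1=Spring .. 4=Winter encoding used in this notebook
    y_column='PM10',
    aggregation_func='mean',        ## string alias accepted by DataFrame.agg
    plot_title='[Penrose] Mean PM10 Concentration per Season',
    x_axis_title='Season',
    y_axis_title='PM10 (µg/m³)',
)
```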
## Function to visualize hourly trends per day of the week
def scatter_pivot_hourly_trends_by_day(data_frame: pd.DataFrame, site_name='Penrose', timestamp_col='Timestamp', y_column='PM2.5', title='Hourly Trends per Day of the Week', height=450):
"""
Visualize hourly trends per day of the week.
Parameters:
data_frame (pd.DataFrame): DataFrame containing the data to plot.
site_name (str): Name of the site to include in the title.
timestamp_col (str): Column name for the timestamp data.
y_column (str): Column name of the data to plot.
title (str): Title of the plot.
"""
if timestamp_col not in data_frame.columns or y_column not in data_frame.columns:
raise ValueError(f"Missing required columns in DataFrame: {timestamp_col} or {y_column}")
## Ensure the Timestamp column is in datetime format and set it as index if not already
CommmonUtils.ensure_timestamp_index(data_frame, timestamp_col)
## Debug: Check the preparation of necessary columns
if 'DayOfWeekName' not in data_frame.columns:
## Prepare data by extracting day name of the week: Return Monday..Sunday instead of 0..6
data_frame['DayOfWeekName'] = data_frame[timestamp_col].dt.day_name()
## Generate a scatter plot trace for each day of the week with predefined reddish shades
colors = ['#fabebe', '#fa8072', '#e9967a', '#ff6347', '#ff4500', '#ff0000', '#ffcccb'] ## One color per day, Monday..Sunday
## Reorder Days of the Week to ensure plots follow the usual weekly order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
## Pivot Table to get average y_column values for each hour across each day of the week
data_frame_pivot = data_frame.pivot_table(index='Hour', columns='DayOfWeekName', values=y_column, aggfunc='mean')
data_frame_pivot = data_frame_pivot[day_order] ## Reorder columns based on day_order
## Fill any NaN values which might occur due to no data points at specific hours or days
data_frame_pivot.fillna(0, inplace=True)
## Generate plot traces for each day with predefined colors
traces = [go.Scatter(x=data_frame_pivot.index, y=data_frame_pivot[day], mode='lines+markers', name=day, line=dict(color=color))
for day, color in zip(day_order, colors)]
## Plot layout settings
layout = go.Layout(
title={'text': f'[{site_name}] {y_column} {title}', 'y':0.9, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
xaxis={'title': 'Hour of Day'},
yaxis={'title': y_column},
height=height,
# template='plotly_dark'
)
## Create and display the figure
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
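The pivot underlying this plot can be inspected on its own; a minimal sketch (assuming cleaned_data_site1 with a datetime 'Timestamp' column and an 'Hour' column, as elsewhere in this notebook):

```python
pivot = cleaned_data_site1.assign(
    DayOfWeekName=cleaned_data_site1['Timestamp'].dt.day_name()
).pivot_table(index='Hour', columns='DayOfWeekName', values='PM2.5', aggfunc='mean')
print(pivot.shape)  ## expected: (24, 7) -- one row per hour, one column per weekday
```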
def enhanced_seasonal_box_plot(data_frame, site_name='Penrose', season_col='Season', timestamp_col='Hour', y_column='PM2.5', title='Distribution by the Hour in each Season', xtitle='Hour of Day', ytitle='Level (µg/m³)', height=600):
"""
Generate an enhanced box plot visualizing PM2.5/PM10 levels across different hours of the day for each season, using dynamic labeling and colors.
Parameters:
data_frame (pd.DataFrame): DataFrame containing the environmental data.
season_col (str): Name of the column in data_frame specifying the season.
timestamp_col (str): Name of the column in data_frame specifying the hour of the day.
y_column (str): Name of the column in data_frame specifying the target feature/variable.
title (str): Title for the plot; defaults to a generic title if None.
xtitle (str): Title for the x-axis.
ytitle (str): Title for the y-axis.
Returns:
None; displays the plot directly.
"""
## Map season codes to names
season_names = {1: 'Spring', 2: 'Summer', 3: 'Autumn', 4: 'Winter'}
if 'SeasonName' not in data_frame.columns:
data_frame['SeasonName'] = data_frame[season_col].map(season_names)
fig = px.box(data_frame, x=timestamp_col, y=y_column, color='SeasonName',
category_orders={'SeasonName': list(season_names.values())},
title=f'[{site_name}] {y_column} {title}',
labels={timestamp_col: xtitle, y_column: ytitle})
fig.update_layout(
# template='plotly_dark'
title={'x': 0.5, 'xanchor': 'center'}, ## Center align title
xaxis=dict(
tickmode='linear',
tick0=0,
dtick=1,
title=xtitle,
titlefont=dict(size=14)
),
yaxis=dict(
tickmode='linear',
tick0=0,
dtick=10,
title=ytitle,
titlefont=dict(size=14)
),
height=height )
fig.show()
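Usage sketch (assuming cleaned_data_site1 carries the engineered 'Season' and 'Hour' columns):

```python
enhanced_seasonal_box_plot(
    data_frame=cleaned_data_site1,
    site_name='Penrose',
    y_column='PM10',
    ytitle='PM10 Level (µg/m³)',
)
```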
## Comprehensive Time Series Visualization Functions
def timeseries_visualization(df1, df2, timestamp_col='Timestamp', target_cols=None, freq='H'):
"""Visualize Site1 (Penrose) and Site2 (Takapuna) data separately."""
if target_cols is None:
target_cols = ['PM2.5', 'PM10'] ## Sensible default when no targets are given
for df in [df1, df2]:
if timestamp_col not in df.columns:
raise ValueError(f"🪲 {timestamp_col} is not in the DataFrame!")
for target_col in target_cols:
for df, site_name in zip([df1, df2], ["Site1-Penrose", "Site2-Takapuna"]):
if target_col not in df.columns:
raise ValueError(f"'{target_col}' is not in the DataFrame for {site_name}")
# print(f'\n📈🎓 [EDA.1] Visualize Hourly Environmental Data using Line Plots with a Range Slider and Bar Plots.')
# print(f"\n📈 [EDA.1: ][Line Plot][Time-Series Plot] Average/Mean Hourly Visualizing {target_col} for {site_name}: Line Plot with a Range Slider and Timestamp Selector over hours, days, months, years.")
line_plot_time_series_with_slider(data_frame=df, site_name=site_name, timestamp_col='Timestamp', y_column=target_col, title='Average/Mean Concentration Over Time', freq=freq)
print(f"\n📈 [EDA.2: ][Scatter Plot][Hours of the Day] Average/Mean Hourly Visualizing {target_col} for {site_name}: Scatter Plot Time Series by Hour of Day.")
scatter_plot_time_series_by_hour(data_frame=df, site_name=site_name, timestamp_col='Timestamp', y_column=target_col, title='Average/Mean Concentration by Hour of Day')
print(f"\n📈🎓 [EDA.3: ][Pivot Table][Hour of the Day per Every Day of the Week] Hourly Trends per Every Day of the Week {target_col} for {site_name}")
scatter_pivot_hourly_trends_by_day(data_frame=df, site_name=site_name, timestamp_col='Timestamp', y_column=target_col, title='Hourly Trends per Day of the Week')
print(f"\n📈🎓 [EDA.4: ][Box Plot][Hour of the Day in each Season] {target_col} for {site_name}: Distribution by the Hour in each Season")
enhanced_seasonal_box_plot(data_frame=df, site_name=site_name, season_col='Season', timestamp_col='Hour', y_column=target_col, title=f'Distribution by the Hour in each Season', xtitle='Hour of Day', ytitle=f'{target_col} Level (µg/m³)')
print(f"\n📈🎓 [EDA.5: ][Bar Plot] Total Yearly & Monthly Visualizing {target_col} for {site_name}: [Bar Plot] Total Concentration per Year/Month.\n")
## Yearly Aggregation: Total/Sum of PMx Concentration per Year
enhanced_bar_plot(data_frame=df, group_by_col='Year', y_column=target_col, aggregation_func='sum', plot_title=f'[{site_name}] Total {target_col} Concentration per Year (Yearly Aggregation)', x_axis_title='Year', y_axis_title=f"{target_col} (µg/m³)")
## Monthly Aggregation: Total/Sum of PMx Concentration per Month
enhanced_bar_plot(data_frame=df, group_by_col='Month', y_column=target_col, aggregation_func='sum', plot_title=f'[{site_name}] Total {target_col} Concentration per Month (Monthly Aggregation)', x_axis_title='Month', y_axis_title=f"{target_col} (µg/m³)")
## Seasonal Aggregation: Total/Sum of PMx Concentration per Season
enhanced_bar_plot(data_frame=df, group_by_col='Season', y_column=target_col, aggregation_func='sum', plot_title=f'[{site_name}] Total {target_col} Concentration per Season (Season Aggregation)', x_axis_title='Season', y_axis_title=f"{target_col} (µg/m³)")
## cleaned_data_site1 and cleaned_data_site2 are DataFrames containing the data for Penrose and Takapuna sites
target_cols = ['PM2.5', 'PM10'] ## Default target columns to visualize
timeseries_visualization(cleaned_data_site1.copy(), cleaned_data_site2.copy(), timestamp_col='Timestamp', target_cols=target_cols, freq='D')
📈 [EDA.2: ][Scatter Plot][Hours of the Day] Average/Mean Hourly Visualizing PM2.5 for Site1-Penrose: Scatter Plot Time Series by Hour of Day.
📈🎓 [EDA.3: ][Pivot Table][Hour of the Day per Every Day of the Week] Hourly Trends per Every Day of the Week PM2.5 for Site1-Penrose
📈🎓 [EDA.4: ][Box Plot][Hour of the Day in each Season] PM2.5 for Site1-Penrose: Distribution by the Hour in each Season
📈🎓 [EDA.5: ][Bar Plot] Total Yearly & Monthly Visualizing PM2.5 for Site1-Penrose: [Bar Plot] Total Concentration per Year/Month.
📈 [EDA.2: ][Scatter Plot][Hours of the Day] Average/Mean Hourly Visualizing PM2.5 for Site2-Takapuna: Scatter Plot Time Series by Hour of Day.
📈🎓 [EDA.3: ][Pivot Table][Hour of the Day per Every Day of the Week] Hourly Trends per Every Day of the Week PM2.5 for Site2-Takapuna
📈🎓 [EDA.4: ][Box Plot][Hour of the Day in each Season] PM2.5 for Site2-Takapuna: Distribution by the Hour in each Season
📈🎓 [EDA.5: ][Bar Plot] Total Yearly & Monthly Visualizing PM2.5 for Site2-Takapuna: [Bar Plot] Total Concentration per Year/Month.
📈 [EDA.2: ][Scatter Plot][Hours of the Day] Average/Mean Hourly Visualizing PM10 for Site1-Penrose: Scatter Plot Time Series by Hour of Day.
📈🎓 [EDA.3: ][Pivot Table][Hour of the Day per Every Day of the Week] Hourly Trends per Every Day of the Week PM10 for Site1-Penrose
📈🎓 [EDA.4: ][Box Plot][Hour of the Day in each Season] PM10 for Site1-Penrose: Distribution by the Hour in each Season
📈🎓 [EDA.5: ][Bar Plot] Total Yearly & Monthly Visualizing PM10 for Site1-Penrose: [Bar Plot] Total Concentration per Year/Month.
📈 [EDA.2: ][Scatter Plot][Hours of the Day] Average/Mean Hourly Visualizing PM10 for Site2-Takapuna: Scatter Plot Time Series by Hour of Day.
📈🎓 [EDA.3: ][Pivot Table][Hour of the Day per Every Day of the Week] Hourly Trends per Every Day of the Week PM10 for Site2-Takapuna
📈🎓 [EDA.4: ][Box Plot][Hour of the Day in each Season] PM10 for Site2-Takapuna: Distribution by the Hour in each Season
📈🎓 [EDA.5: ][Bar Plot] Total Yearly & Monthly Visualizing PM10 for Site2-Takapuna: [Bar Plot] Total Concentration per Year/Month.
## Analyzing for both sites
top_features_PM25_site1 = DataFrameAdapter.get_top_correlated_features(cleaned_data_site1, 'PM2.5', num_features=12)
top_features_PM25_site2 = DataFrameAdapter.get_top_correlated_features(cleaned_data_site2, 'PM2.5', num_features=12)
print("\n🌟 Top 12 features highly correlated with PM2.5 in Penrose: ", top_features_PM25_site1)
print("\n🌟 Top 12 features highly correlated with PM2.5 in Takapuna: ", top_features_PM25_site2)
🌟 Top 12 features highly correlated with PM2.5 in Penrose: ['PM10', 'PM10_Lag1', 'NOx', 'NO', 'PM10_Lag2', 'AQI', 'NO2', 'SO2', 'Wind_Dir', 'Air_Temp', 'Season', 'Wind_Speed']
🌟 Top 12 features highly correlated with PM2.5 in Takapuna: ['PM10', 'PM10_Lag1', 'AQI', 'PM10_Lag2', 'NO', 'NOx', 'Air_Temp', 'NO2', 'Rel_Humidity', 'Wind_Dir', 'Quarter', 'Month']
## Analyzing for both sites
top_features_PM10_site1 = DataFrameAdapter.get_top_correlated_features(cleaned_data_site1, 'PM10')
top_features_PM10_site2 = DataFrameAdapter.get_top_correlated_features(cleaned_data_site2, 'PM10')
print("\n🌟Top 12 features highly correlated with PM10 in Penrose: ", top_features_PM10_site1)
print("\n🌟Top 12 features highly correlated with PM10 in Takapuna: ", top_features_PM10_site2)
🌟Top 12 features highly correlated with PM10 in Penrose: ['AQI', 'PM2.5', 'PM2.5_Lag1', 'PM2.5_Lag2', 'NO', 'NOx', 'Wind_Speed', 'Rel_Humidity', 'SO2', 'NO2', 'Air_Temp', 'Wind_Dir'] 🌟Top 12 features highly correlated with PM10 in Takapuna: ['PM2.5', 'PM2.5_Lag1', 'PM2.5_Lag2', 'AQI', 'NO', 'NOx', 'Hour', 'Wind_Speed', 'NO2', 'Wind_Dir', 'Day', 'Rel_Humidity']
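get_top_correlated_features is defined in DataFrameAdapter.ipynb; as a rough idea of the ranking it performs, here is a minimal sketch based on absolute Pearson correlation (an assumption; the actual implementation may differ):

```python
import pandas as pd

def top_correlated_features_sketch(df: pd.DataFrame, target: str, num_features: int = 12) -> list:
    """Rank numeric columns by |Pearson r| against `target` (illustrative only)."""
    corr = df.select_dtypes('number').corr()[target].abs()  ## absolute correlations
    corr = corr.drop(labels=[target], errors='ignore')      ## exclude the target itself
    return corr.sort_values(ascending=False).head(num_features).index.tolist()
```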
## 'cleaned_data_site1' and 'cleaned_data_site2' are your cleaned datasets for Penrose and Takapuna respectively.
## 'top_features_site1' and 'top_features_site2' should be lists of top features determined from correlation analysis.
selected_features_PM25_penrose, summary_stats_PM25_penrose = DataFrameAdapter.compute_summary_statistics(cleaned_data_site1, 'PM2.5', top_features_PM25_site1)
selected_features_PM25_takapuna, summary_stats_PM25_takapuna = DataFrameAdapter.compute_summary_statistics(cleaned_data_site2, 'PM2.5', top_features_PM25_site2)
print("\n🎓 [PM2.5] Summary Statistics for Penrose:")
summary_stats_PM25_penrose
🎓 [PM2.5] Summary Statistics for Penrose:
| | PM2.5 | PM10 | PM10_Lag1 | NOx | NO | PM10_Lag2 | AQI | NO2 | SO2 | Wind_Dir | Air_Temp | Season | Wind_Speed |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 |
mean | 5.400065 | 13.886201 | 13.887018 | 23.837744 | 9.871754 | 13.887496 | 28.233712 | 13.973526 | 0.846799 | 173.193482 | 16.744259 | 2.502791 | 2.805975 |
std | 5.204060 | 7.958464 | 7.958003 | 30.254179 | 20.526086 | 7.957911 | 9.608861 | 12.520003 | 1.482960 | 93.604420 | 4.005293 | 1.123019 | 1.533803 |
min | -12.300000 | -6.400000 | -6.400000 | -1.300000 | -1.900000 | -6.400000 | 0.000000 | -1.400000 | -4.400000 | 1.000000 | 3.000000 | 1.000000 | 0.200000 |
25% | 2.200000 | 8.500000 | 8.500000 | 5.300000 | 0.900000 | 8.500000 | 22.000000 | 4.100000 | 0.100000 | 74.000000 | 14.000000 | 1.000000 | 1.600000 |
50% | 4.700000 | 13.100000 | 13.100000 | 14.800000 | 3.700000 | 13.100000 | 27.000000 | 10.500000 | 0.500000 | 210.000000 | 17.000000 | 3.000000 | 2.600000 |
75% | 7.700000 | 18.300000 | 18.300000 | 31.100000 | 10.700000 | 18.300000 | 33.000000 | 20.300000 | 1.000000 | 235.000000 | 20.000000 | 4.000000 | 3.800000 |
max | 138.400000 | 196.700000 | 196.700000 | 455.100000 | 378.400000 | 196.700000 | 121.000000 | 83.100000 | 25.200000 | 357.000000 | 28.000000 | 4.000000 | 8.200000 |
print("🎓 [PM2.5] Summary Statistics for Takapuna:")
summary_stats_PM25_takapuna
🎓 [PM2.5] Summary Statistics for Takapuna:
| | PM2.5 | PM10 | PM10_Lag1 | AQI | PM10_Lag2 | NO | NOx | Air_Temp | NO2 | Rel_Humidity | Wind_Dir | Quarter | Month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 |
mean | 6.237181 | 11.562504 | 11.562559 | 25.152947 | 11.562660 | 11.637256 | 23.521597 | 16.342980 | 0.012103 | 69.842651 | 176.424542 | 2.514315 | 6.540181 |
std | 3.401947 | 7.778661 | 7.778618 | 10.637321 | 7.778534 | 21.970397 | 29.903734 | 3.952319 | 0.010751 | 11.760956 | 92.268417 | 1.120882 | 3.460755 |
min | 0.450000 | -3.800000 | -3.800000 | -13.000000 | -3.800000 | -0.800000 | -0.700000 | 3.500000 | -0.000500 | 30.400000 | 5.500000 | 1.000000 | 1.000000 |
25% | 4.300000 | 7.468750 | 7.468750 | 19.000000 | 7.468750 | 2.350000 | 6.912500 | 13.500000 | 0.003950 | 61.200000 | 76.500000 | 2.000000 | 4.000000 |
50% | 5.550000 | 10.650000 | 10.650000 | 24.000000 | 10.650000 | 5.700000 | 14.700000 | 16.000000 | 0.008800 | 70.950000 | 203.500000 | 3.000000 | 7.000000 |
75% | 7.300000 | 14.400000 | 14.400000 | 30.000000 | 14.400000 | 11.900000 | 28.500000 | 19.500000 | 0.017100 | 79.700000 | 260.000000 | 4.000000 | 10.000000 |
max | 52.300000 | 437.250000 | 437.250000 | 318.000000 | 437.250000 | 352.450000 | 401.950000 | 27.000000 | 0.072100 | 90.350000 | 346.500000 | 4.000000 | 12.000000 |
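compute_summary_statistics likewise comes from DataFrameAdapter.ipynb; conceptually it reduces to a describe() over the target plus its top-correlated features. A hypothetical sketch of that idea (not the actual implementation):

```python
def compute_summary_statistics_sketch(df, target, top_features):
    """Illustrative only: select target + correlated features, then describe them."""
    selected = [target] + [f for f in top_features if f in df.columns]
    return selected, df[selected].describe()
```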
## 'cleaned_data_site1' and 'cleaned_data_site2' are your cleaned datasets for Penrose and Takapuna respectively.
## 'top_features_site1' and 'top_features_site2' should be lists of top features determined from correlation analysis.
selected_features_PM10_penrose, summary_stats_PM10_penrose = DataFrameAdapter.compute_summary_statistics(cleaned_data_site1, 'PM10', top_features_PM10_site1)
selected_features_PM10_takapuna, summary_stats_PM10_takapuna = DataFrameAdapter.compute_summary_statistics(cleaned_data_site2, 'PM10', top_features_PM10_site2)
print("\n🎓 [PM10] Summary Statistics for Penrose:")
summary_stats_PM10_penrose
🎓 [PM10] Summary Statistics for Penrose:
| | PM10 | AQI | PM2.5 | PM2.5_Lag1 | PM2.5_Lag2 | NO | NOx | Wind_Speed | Rel_Humidity | SO2 | NO2 | Air_Temp | Wind_Dir |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 | 17375.000000 |
mean | 13.886201 | 28.233712 | 5.400065 | 5.400681 | 5.401262 | 9.871754 | 23.837744 | 2.805975 | 69.415771 | 0.846799 | 13.973526 | 16.744259 | 173.193482 |
std | 7.958464 | 9.608861 | 5.204060 | 5.204694 | 5.205324 | 20.526086 | 30.254179 | 1.533803 | 12.287341 | 1.482960 | 12.520003 | 4.005293 | 93.604420 |
min | -6.400000 | 0.000000 | -12.300000 | -12.300000 | -12.300000 | -1.900000 | -1.300000 | 0.200000 | 26.900000 | -4.400000 | -1.400000 | 3.000000 | 1.000000 |
25% | 8.500000 | 22.000000 | 2.200000 | 2.200000 | 2.200000 | 0.900000 | 5.300000 | 1.600000 | 60.200000 | 0.100000 | 4.100000 | 14.000000 | 74.000000 |
50% | 13.100000 | 27.000000 | 4.700000 | 4.700000 | 4.700000 | 3.700000 | 14.800000 | 2.600000 | 70.400000 | 0.500000 | 10.500000 | 17.000000 | 210.000000 |
75% | 18.300000 | 33.000000 | 7.700000 | 7.700000 | 7.700000 | 10.700000 | 31.100000 | 3.800000 | 79.500000 | 1.000000 | 20.300000 | 20.000000 | 235.000000 |
max | 196.700000 | 121.000000 | 138.400000 | 138.400000 | 138.400000 | 378.400000 | 455.100000 | 8.200000 | 92.000000 | 25.200000 | 83.100000 | 28.000000 | 357.000000 |
print("🎓 [PM10] Summary Statistics for Takapuna:")
summary_stats_PM10_takapuna
🎓 [PM10] Summary Statistics for Takapuna:
| | PM10 | PM2.5 | PM2.5_Lag1 | PM2.5_Lag2 | AQI | NO | NOx | Hour | Wind_Speed | NO2 | Wind_Dir | Day | Rel_Humidity |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 | 17359.000000 |
mean | 11.562504 | 6.237181 | 6.237259 | 6.237348 | 25.152947 | 11.637256 | 23.521597 | 11.503428 | 2.497209 | 0.012103 | 176.424542 | 15.830463 | 69.842651 |
std | 7.778661 | 3.401947 | 3.401884 | 3.401809 | 10.637321 | 21.970397 | 29.903734 | 6.923210 | 1.455767 | 0.010751 | 92.268417 | 8.760502 | 11.760956 |
min | -3.800000 | 0.450000 | 0.450000 | 0.450000 | -13.000000 | -0.800000 | -0.700000 | 0.000000 | 0.200000 | -0.000500 | 5.500000 | 1.000000 | 30.400000 |
25% | 7.468750 | 4.300000 | 4.300000 | 4.300000 | 19.000000 | 2.350000 | 6.912500 | 6.000000 | 1.250000 | 0.003950 | 76.500000 | 8.000000 | 61.200000 |
50% | 10.650000 | 5.550000 | 5.550000 | 5.550000 | 24.000000 | 5.700000 | 14.700000 | 12.000000 | 2.300000 | 0.008800 | 203.500000 | 16.000000 | 70.950000 |
75% | 14.400000 | 7.300000 | 7.300000 | 7.300000 | 30.000000 | 11.900000 | 28.500000 | 18.000000 | 3.500000 | 0.017100 | 260.000000 | 23.000000 | 79.700000 |
max | 437.250000 | 52.300000 | 52.300000 | 52.300000 | 318.000000 | 352.450000 | 401.950000 | 23.000000 | 7.800000 | 0.072100 | 346.500000 | 31.000000 | 90.350000 |
def determine_seasonal_period(freq='H', seasonality='Daily'):
"""
Determines the seasonal period 'p' based on the data frequency and expected seasonality.
Parameters:
- freq (str): Frequency of the data points ('H' for hourly, 'D' for daily, etc.).
- seasonality (str): Expected seasonality ('daily', 'weekly', 'monthly', etc.).
Returns:
- p (int): The seasonal period.
"""
## Normalize freq and seasonality to lower case to handle different capitalizations
freq = freq.lower()
seasonality = seasonality.lower()
if freq.endswith('h'):
if seasonality == 'daily':
return 24 ## Hourly data and the expectation is that patterns repeat every day, p = 24 (since there are 24 hours in a day).
elif seasonality == 'weekly':
return 24 * 7 ## If patterns repeat on a weekly basis, 24 hours in a day and 7 days in a week
elif seasonality == 'monthly':
return 730 ## Approximation p = int(24 * 30.4375) --> 730
elif seasonality == 'quarterly':
return 2190 ## Approximation int(24 * 30.4375 * 3)
elif seasonality == 'seasons':
return 2190 ## FIXME: Four seasons in a year int(24 * 30.4375 * 3)
elif seasonality == 'yearly':
return 24 * 365 # Non-leap year
elif freq.endswith('d'):
if seasonality == 'daily':
return 1 ## For daily data, if you're looking for daily seasonality, it doesn't make sense - each day is its own season.
elif seasonality == 'weekly':
return 7 ## There are 7 days in a week, so the seasonality period is 7.
elif seasonality == 'monthly':
return 30 ## Approximately, a month has 30 days.
elif seasonality == 'quarterly':
return 91 ## Approximately, a quarter has 3 months, each with about 30 days.
elif seasonality == 'seasons':
return 91 ## FIXME: In New Zealand, seasons are roughly three months long, like quarters.
elif seasonality == 'yearly':
return 365 ## There are 365 days in a year (366 for leap years).
## Extend logic for other frequencies if needed
raise ValueError(f"Unsupported frequency-seasonality combination: {freq}-{seasonality}")
def seasonal_decomposition_and_stationarity(data, site, variables, freq='1d', seasonality='daily'):
"""
Performs seasonal decomposition and stationarity analysis on selected variables for a specified site.
Parameters:
data (DataFrame): The dataset containing air quality and meteorological data.
site (str): Site identifier used in the plot titles.
variables (dict): Variables to analyze, mapped to their descriptive names.
freq (str): Resampling frequency (e.g. '1d' for daily means).
seasonality (str): Expected seasonality ('daily', 'weekly', 'monthly', etc.).
Returns:
None
"""
## Filter data for the specified site
# site_data = data[data['Site'] == site]
## Resample data by day and calculate mean for each day
resampled_data = CommmonUtils.resample_data(data, freq=freq)
## Determine the seasonal period
period = determine_seasonal_period(freq=freq, seasonality=seasonality)
# Iterate over each variable to perform analysis
for var, description in variables.items():
if var in resampled_data.columns:
## The seasonal period was determined above by determine_seasonal_period(freq, seasonality),
## e.g. 24 for hourly data with daily seasonality or 7 for daily data with weekly seasonality.
## Perform seasonal decomposition
decomposition = sm.tsa.seasonal_decompose(resampled_data[var], model='additive', period=period)
# Plot the decomposition results
fig, axs = plt.subplots(2, 2, figsize=(14, 8))
decomposition.trend.plot(ax=axs[0, 0], title='Trend')
decomposition.seasonal.plot(ax=axs[0, 1], title='Seasonal')
decomposition.resid.plot(ax=axs[1, 0], title='Residual')
decomposition.observed.plot(ax=axs[1, 1], title='Observed')
# Perform Dickey-Fuller test to check stationarity
df_test = adfuller(resampled_data[var].dropna())
df_result = f"Dickey-Fuller (ADF) test: p-value = {df_test[1]:.5f}"
## Plot main-title and subtitle (Dickey-Fuller test results)
plt.suptitle(f'{description} Analysis - {site}', fontsize=16)
plt.figtext(0.5, 0.93, df_result, ha='center', fontsize=12, color='green') ## Subtitle below the main title
for ax in axs.flat:
ax.set_xlabel('Date')
ax.set_ylabel('Value')
plt.tight_layout(rect=[0, 0, 1, 0.96]) # Adjust layout to make room for title
plt.show()
else:
print(f"Warning: {var} not found in the dataset.")
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
## Work on a copy of rawdata for the dashboard so the original DataFrame is not modified
data_series_df = rawdata.copy()
# ## Set 'Site' and 'Timestamp' as a multi-level index
# data_series_df.set_index(['Site', 'Timestamp'], inplace=True)
# ## TODO: log-scale ?
# # data_site1 = rawdata.loc[(site1, slice(None)), numerical_columns]
# pd_site1 = data_series_df.xs('Penrose', level='Site')[numerical_columns]
# pd_site2 = data_series_df.xs('Takapuna', level='Site')[numerical_columns]
data_series_df
| | Timestamp | AQI | PM10 | PM2.5 | SO2 | NO | NO2 | NOx | Wind_Speed | Wind_Dir | Air_Temp | Rel_Humidity | Site |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-05-07 00:00:00 | 26.0 | 16.50 | 16.10 | 1.8 | 60.30 | 40.90000 | 101.20 | 0.60 | 316.0 | 8.0 | 78.00 | Penrose |
1 | 2020-05-07 01:00:00 | 28.0 | 17.70 | 10.10 | 1.0 | 16.00 | 29.20000 | 45.30 | 0.70 | 269.0 | 8.0 | 76.80 | Penrose |
2 | 2020-05-07 02:00:00 | 28.0 | 15.00 | 10.30 | 0.1 | 16.00 | 29.20000 | 45.30 | 1.00 | 180.0 | 8.0 | 78.40 | Penrose |
3 | 2020-05-07 03:00:00 | 29.0 | 14.30 | 11.40 | 0.0 | 11.20 | 27.50000 | 38.70 | 0.80 | 232.0 | 8.0 | 77.50 | Penrose |
4 | 2020-05-07 04:00:00 | 30.0 | 8.80 | 10.60 | -0.1 | 12.00 | 28.50000 | 40.50 | 0.80 | 274.0 | 7.0 | 80.10 | Penrose |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34729 | 2022-04-30 19:00:00 | 14.0 | 4.75 | 3.30 | NaN | 0.60 | 0.00440 | 5.00 | 2.55 | 109.5 | 16.0 | 76.05 | Takapuna |
34730 | 2022-04-30 20:00:00 | 14.0 | 6.35 | 3.15 | NaN | 0.50 | 0.00365 | 4.15 | 2.45 | 105.5 | 16.0 | 74.50 | Takapuna |
34731 | 2022-04-30 21:00:00 | 14.0 | 6.05 | 2.80 | NaN | 0.40 | 0.00480 | 5.20 | 2.35 | 115.5 | 16.0 | 74.15 | Takapuna |
34732 | 2022-04-30 22:00:00 | 13.0 | 4.20 | 2.60 | NaN | 0.40 | 0.00555 | 5.90 | 1.95 | 122.5 | 16.0 | 75.95 | Takapuna |
34733 | 2022-04-30 23:00:00 | 13.0 | 5.00 | 2.80 | NaN | 0.35 | 0.00405 | 4.30 | 1.95 | 119.0 | 16.0 | 78.00 | Takapuna |
34734 rows × 13 columns
import vizro.models as vm
import vizro.plotly.express as px
from vizro import Vizro
from vizro.tables import dash_ag_grid
2024-05-16 00:07:21,506 - INFO - Overwriting plotly default template with vizro chart template. To revert back to the default template, run `import plotly.io as pio; pio.templates.default = 'plotly'`.
Vizro._reset()
feature = 'PM2.5'
ds='Timestamp'
x='NO'
figure = px.scatter(data_series_df, x=ds, y=feature, color='Site', title='[Line Chart] Features Visualization across Locations')
# Make sure 'AQI' values are positive by taking the absolute value.
# Additionally, ensure that 'AQI' values are scaled to a visible range for the marker size.
data_series_df['AQI_positive'] = data_series_df['AQI'].abs()
data_series_df['AQI_scaled'] = (data_series_df['AQI_positive'] - data_series_df['AQI_positive'].min()) / (data_series_df['AQI_positive'].max() - data_series_df['AQI_positive'].min()) * 100
tab_1 = vm.Container(
title="Dataset",
components=[
vm.AgGrid(title="Title - AG Grid", figure=dash_ag_grid(data_frame=data_series_df)),
],
)
tab_2 = vm.Container(
title="Line Chart",
components=[
vm.Graph(
id="scatter_chart",
figure=figure,
),
],
)
tab_3 = vm.Container(
title="Correlation",
components=[
vm.Graph(
figure=px.scatter(
data_series_df,
x=x,
y=feature,
color='Site',
marginal_y="violin",
marginal_x="box",
title="Container - Scatter",
)
),
],
)
tab_4 = vm.Container(
title="Tab III",
components=[
vm.Graph(
figure=px.scatter(
data_series_df,
title="Graph 4",
x=x,
y=feature,
size='AQI_scaled',
color='Site',
),
),
],
)
page = vm.Page(title="Time Series EDA",
components=[vm.Tabs(tabs=[tab_1, tab_2, tab_3, tab_4])],
controls=[vm.Filter(column='Site', selector=vm.Dropdown(value=['ALL']))],)
dashboard = vm.Dashboard(pages=[page])
IS_JUPYTERLAB = True
if IS_JUPYTERLAB:
Vizro(assets_folder="assets").build(dashboard).run(port=8008)
else:
app = Vizro().build(dashboard)
server = app.dash.server
if __name__ == "__main__":
app.run(port=8008)
def plot_selected_features_plotly(data, sites, numerical_columns, selected_features):
"""
Plots selected numerical features for specified sites using Plotly, allowing toggling visibility of features,
with dynamic adjustment of plot height based on the number of legend items.
Parameters:
- data (DataFrame): The dataset containing 'Timestamp', 'Site', and numerical columns.
- sites (list): List of sites to be visualized.
- numerical_columns (list): List of all numerical columns available for plotting.
- selected_features (list): List of features to be initially visible on the plot.
Returns:
- Plotly figure with the ability to toggle visibility of non-selected features and dynamically adjusted height.
"""
## Create an empty list to store traces
traces = []
## Iterate through each site
for site in sites:
## Filter the data for the current site
site_data = data[data['Site'] == site]
## Iterate through each numerical feature
for feature in numerical_columns:
# Check if the feature should be initially visible
is_visible = feature in selected_features
## Create a scatter plot for the current feature
trace = go.Scatter(
x=site_data['Timestamp'],
y=site_data[feature],
mode='lines+markers',
name=f"{site} - {feature}",
text=site_data[feature],
visible='legendonly' if not is_visible else True ## Toggle visibility based on selection
)
traces.append(trace)
## Calculate the dynamic height: base height + additional space per legend item
base_height = 400
additional_height_per_item = 20
total_legend_items = len(sites) * len(numerical_columns)
dynamic_height = base_height + (additional_height_per_item * total_legend_items)
## Define the layout of the plot
layout = go.Layout(
title='Selected Features Visualization Across Sites',
xaxis=dict(title='Timestamp'),
yaxis=dict(title='Measurement Values'),
legend=dict(title='Site - Feature'),
hovermode='closest',
height=dynamic_height # Use the dynamically calculated height
)
## Create the figure with traces and layout
fig = go.Figure(data=traces, layout=layout)
## Enhance the figure by customizing axis labels, titles, and enabling the legend
fig.update_layout(title_text='Selected Air Quality Features Across Sites')
return fig
sites = ['Penrose', 'Takapuna']
numerical_columns = ['AQI', 'PM10', 'PM2.5', 'SO2', 'NO', 'NO2', 'NOx', 'Wind_Speed', 'Wind_Dir', 'Air_Temp', 'Rel_Humidity']
selected_features = ['PM2.5'] ## Initially visible features
fig = plot_selected_features_plotly(data_series_df, sites, numerical_columns, selected_features)
## Show the plot
fig.show()