elisaklunder committed
Commit 1d3c9ee · Parent: e3ae012

data pipelines
app.py CHANGED
@@ -3,9 +3,8 @@ import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
 
-from data_api_calls import get_data
 from src.helper_functions import custom_metric_box, pollution_box
-from src.models_loading import run_model
+from src.predict import get_data_and_predictions
 
 st.set_page_config(
     page_title="Utrecht Pollution Dashboard",
@@ -16,33 +15,24 @@ st.set_page_config(
 
 alt.themes.enable("dark")
 
-dataset = get_data()
-today = dataset.iloc[-1]
-previous_day = dataset.iloc[-2]
-prediction = run_model("O3", data=dataset)
-pred1 = prediction[0][0]
-pred2 = prediction[0][1]
-pred3 = prediction[0][2]
+week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
+today = week_data.iloc[-1]
+previous_day = week_data.iloc[-2]
 
 dates_past = pd.date_range(end=pd.Timestamp.today(), periods=8).to_list()
-dates_future = pd.date_range(start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3).to_list()
+dates_future = pd.date_range(
+    start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3
+).to_list()
 
 # O3 and NO2 values for the past 7 days
-o3_past_values = dataset["O3"]
-no2_past_values = dataset["NO2"]
-
-# Predicted O3 and NO2 values for the next 3 days (convert to pandas Series)
-o3_future_values = pd.Series(prediction[0].flatten())  # Flatten the array to 1D
-no2_future_values = pd.Series([26, 27, 28])  # Example prediction data
-
-# Combine the past and future values using pd.concat
+o3_past_values = week_data["O3"]
+no2_past_values = week_data["NO2"]
+o3_future_values = pd.Series(predictions_O3[0].flatten())
+no2_future_values = pd.Series(predictions_NO2[0].flatten())
 o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)
 no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
 
-# Combine dates and values
 dates = dates_past + dates_future
-
-# Create a DataFrame
 df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
@@ -55,13 +45,37 @@ with col1:
     st.subheader("Current Weather")
     subcol1, subcol2 = st.columns((1, 1))
     with subcol1:
-        custom_metric_box(label="Temperature", value=f"{round(today['mean_temp'] * 0.1)} °C", delta=f"{round(today['mean_temp'] * 0.1) - round(previous_day['mean_temp'] * 0.1)} °C")
-        custom_metric_box(label="Humidity", value=f"{round(today['humidity'])} %", delta=f"{round(today['humidity']) - round(previous_day['humidity'])} %")
-        custom_metric_box(label="Pressure", value=f"{round(today['pressure'] * 0.1)} hPa", delta=f"{round(today['pressure'] * 0.1) - round(previous_day['pressure'] * 0.1)} hPa")
+        custom_metric_box(
+            label="Temperature",
+            value=f"{round(today['mean_temp'] * 0.1)} °C",
+            delta=f"{round(today['mean_temp'] * 0.1) - round(previous_day['mean_temp'] * 0.1)} °C",
+        )
+        custom_metric_box(
+            label="Humidity",
+            value=f"{round(today['humidity'])} %",
+            delta=f"{round(today['humidity']) - round(previous_day['humidity'])} %",
+        )
+        custom_metric_box(
+            label="Pressure",
+            value=f"{round(today['pressure'] * 0.1)} hPa",
+            delta=f"{round(today['pressure'] * 0.1) - round(previous_day['pressure'] * 0.1)} hPa",
+        )
     with subcol2:
-        custom_metric_box(label="Precipitation", value=f"{round(today['percipitation'] * 0.1)} mm", delta=f"{round(today['percipitation'] * 0.1) - round(previous_day['percipitation'] * 0.1)} mm")
-        custom_metric_box(label="Solar Radiation", value=f"{round(today['global_radiation'])} J/m²", delta=f"{round(today['global_radiation']) - round(previous_day['global_radiation'])} J/m²")
-        custom_metric_box(label="Wind Speed", value=f"{round(today['wind_speed'] * 0.1, 1)} m/s", delta=f"{round(today['wind_speed'] * 0.1, 1) - round(previous_day['wind_speed'] * 0.1, 1)} m/s")
+        custom_metric_box(
+            label="Precipitation",
+            value=f"{round(today['percipitation'] * 0.1)} mm",
+            delta=f"{round(today['percipitation'] * 0.1) - round(previous_day['percipitation'] * 0.1)} mm",
+        )
+        custom_metric_box(
+            label="Solar Radiation",
+            value=f"{round(today['global_radiation'])} J/m²",
+            delta=f"{round(today['global_radiation']) - round(previous_day['global_radiation'])} J/m²",
+        )
+        custom_metric_box(
+            label="Wind Speed",
+            value=f"{round(today['wind_speed'] * 0.1, 1)} m/s",
+            delta=f"{round(today['wind_speed'] * 0.1, 1) - round(previous_day['wind_speed'] * 0.1, 1)} m/s",
+        )
 
 with col2:
     st.subheader("Current Pollution Levels")
@@ -69,14 +83,22 @@ with col2:
     # Display the prediction
     # st.write(f'Predicted Pollution Level: {prediction[0]:.2f}')
     with sub1:
-        pollution_box(label="O<sub>3</sub>", value=f"{round(today['O3'])} µg/m³", delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³")
+        pollution_box(
+            label="O<sub>3</sub>",
+            value=f"{round(today['O3'])} µg/m³",
+            delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
+        )
         with st.expander("Learn more about O3", expanded=False):
             st.markdown(
                 "*Ozone (O<sub>3</sub>)*: A harmful gas at ground level, contributing to respiratory issues and aggravating asthma.",
                 unsafe_allow_html=True,
            )
    with sub2:
-        pollution_box(label="NO<sub>2</sub>", value=f"{round(today['NO2'])} µg/m³", delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³")
+        pollution_box(
+            label="NO<sub>2</sub>",
+            value=f"{round(today['NO2'])} µg/m³",
+            delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
+        )
        with st.expander("Learn more about NO2", expanded=False):
            st.markdown(
                "*Nitrogen dioxide (NO<sub>2</sub>)*: A harmful gas at ground level, contributing to respiratory issues and aggravating asthma.",
data_api_calls.py DELETED
@@ -1,191 +0,0 @@
-import codecs
-import csv
-import http.client
-import os
-import re
-import sys
-import urllib.request
-from datetime import date, timedelta
-from io import StringIO
-
-import pandas as pd
-
-
-def pollution_data():
-    particles = ["NO2", "O3"]
-    stations = ["NL10636", "NL10639", "NL10643"]
-    all_dataframes = []
-    today = date.today().isoformat() + "T09:00:00Z"
-    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
-    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
-    days_today = 0
-    days_yesterday = 1
-    while(today != latest_date):
-        days_today += 1
-        days_yesterday += 1
-        for particle in particles:
-            for station in stations:
-                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
-                payload = ''
-                headers = {}
-                conn.request("GET", f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}", payload, headers)
-                res = conn.getresponse()
-                data = res.read()
-                decoded_data = data.decode("utf-8")
-                df = pd.read_csv(StringIO(decoded_data))
-                df = df.filter(like='value')
-                all_dataframes.append(df)
-            combined_data = pd.concat(all_dataframes, ignore_index=True)
-            combined_data.to_csv(f'{particle}_{today}.csv', index=False)
-        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
-        yesterday = (date.today() - timedelta(days_yesterday)).isoformat() + "T09:00:00Z"
-
-def delete_csv(csvs):
-    for csv in csvs:
-        if(os.path.exists(csv) and os.path.isfile(csv)):
-            os.remove(csv)
-
-def clean_values():
-    particles = ["NO2", "O3"]
-    csvs = []
-    NO2 = []
-    O3 = []
-    today = date.today().isoformat() + "T09:00:00Z"
-    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
-    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
-    days_today = 0
-    while(today != latest_date):
-        for particle in particles:
-            name = f'{particle}_{today}.csv'
-            csvs.append(name)
-        days_today += 1
-        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
-    for csv_file in csvs:
-        values = []  # Reset values for each CSV file
-        # Open the CSV file and read the values
-        with open(csv_file, 'r') as file:
-            reader = csv.reader(file)
-            for row in reader:
-                for value in row:
-                    # Use regular expressions to extract numeric part
-                    cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
-                    if cleaned_value:  # If we successfully extract a number
-                        values.append(float(cleaned_value[0]))  # Convert the first match to float
-
-        # Compute the average if the values list is not empty
-        if values:
-            avg = sum(values) / len(values)
-            if "NO2" in csv_file:
-                NO2.append(avg)
-            else:
-                O3.append(avg)
-
-    delete_csv(csvs)
-
-    return NO2, O3
-
-
-def add_columns():
-    file_path = 'weather_data.csv'
-    df = pd.read_csv(file_path)
-
-    df.insert(1, 'NO2', None)
-    df.insert(2, 'O3', None)
-    df.insert(10, 'weekday', None)
-
-    return df
-
-
-def scale(data):
-    df = data
-    columns = list(df.columns)
-
-    columns.insert(3, columns.pop(6))
-
-    df = df[columns]
-
-    columns.insert(5, columns.pop(9))
-
-    df = df[columns]
-
-    columns.insert(9, columns.pop(6))
-
-    df = df[columns]
-
-    df = df.rename(columns={
-        'datetime':'date',
-        'windspeed': 'wind_speed',
-        'temp': 'mean_temp',
-        'solarradiation':'global_radiation',
-        'precip':'percipitation',
-        'sealevelpressure':'pressure',
-        'visibility':'minimum_visibility'
-    })
-
-    df['date'] = pd.to_datetime(df['date'])
-    df['weekday'] = df['date'].dt.day_name()
-
-    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
-    df['mean_temp'] = df['mean_temp'] * 10
-    df['minimum_visibility'] = df['minimum_visibility'] * 10
-    df['percipitation'] = df['percipitation'] * 10
-    df['pressure'] = df['pressure'] * 10
-
-    df['wind_speed'] = df['wind_speed'].astype(int)
-    df['mean_temp'] = df['mean_temp'].astype(int)
-    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
-    df['percipitation'] = df['percipitation'].astype(int)
-    df['pressure'] = df['pressure'].astype(int)
-    df['humidity'] = df['humidity'].astype(int)
-    df['global_radiation'] = df['global_radiation'].astype(int)
-
-    return df
-
-def insert_pollution(NO2, O3, data):
-    df = data
-    start_index = 0
-    while NO2:
-        df.loc[start_index, 'NO2'] = NO2.pop()
-        start_index += 1
-    start_index = 0
-    while O3:
-        df.loc[start_index, 'O3'] = O3.pop()
-        start_index += 1
-    return df
-
-def weather_data():
-    today = date.today().isoformat()
-    seven_days = (date.today() - timedelta(7)).isoformat()
-    try:
-        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{seven_days}/{today}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
-
-        # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
-        # Saving the CSV content to a file
-        current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, 'weather_data.csv')
-        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerows(CSVText)
-
-    except urllib.error.HTTPError as e:
-        ErrorInfo = e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
-        sys.exit()
-    except urllib.error.URLError as e:
-        ErrorInfo = e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
-        sys.exit()
-
-
-def get_data():
-    weather_data()
-    pollution_data()
-    NO2, O3 = clean_values()
-    df = add_columns()
-    scaled_df = scale(df)
-    output_df = insert_pollution(NO2, O3, scaled_df)
-    os.remove('weather_data.csv')
-    return output_df
pages/admin.py CHANGED
@@ -1,8 +1,63 @@
-import streamlit as st
-import pandas as pd
 import numpy as np
-from sklearn.linear_model import LinearRegression
-import joblib
+import pandas as pd
+import streamlit as st
+
+USERNAME = "admin"
+PASSWORD = "password"
+
+st.title("Admin Panel")
 
-# Title
-st.title("Admin Panel")
+# Login Form
+login_success = False
+with st.form("login_form"):
+    st.write("Please login to access the admin dashboard:")
+    username = st.text_input("Username")
+    password = st.text_input("Password", type="password")
+    login_button = st.form_submit_button("Login")
+
+if login_button:
+    if username == USERNAME and password == PASSWORD:
+        login_success = True
+        st.success("Login successful!")
+    else:
+        st.error("Invalid username or password.")
+
+# After successful login
+if login_success:
+    # Display information about model performance
+    st.header("Model Performance Metrics")
+
+    model_r2_score = 0.85  # Mock R^2 Score
+    avg_prediction_time = 0.15  # Mock Average Prediction Time in seconds
+    num_predictions_made = 2000  # Mock Number of Predictions Made
+
+    st.metric(label="R² Score", value=f"{model_r2_score:.2f}")
+    st.metric(
+        label="Average Prediction Time", value=f"{avg_prediction_time:.2f} seconds"
+    )
+    st.metric(label="Total Predictions Made", value=num_predictions_made)
+
+    st.subheader("Detailed Metrics")
+    detailed_metrics = pd.DataFrame(
+        {
+            "Metric": ["MAE", "MSE", "RMSE", "Training Time"],
+            "Value": [2.5, 3.4, 1.8, "1.2 hours"],
+        }
+    )
+    st.table(detailed_metrics)
+
+    # Mocking prediction latency over time (example chart)
+    st.subheader("Prediction Latency Over Time")
+    latency_data = pd.DataFrame(
+        {
+            "Date": pd.date_range(end=pd.Timestamp.today(), periods=7).to_list(),
+            "Prediction Time (s)": np.random.uniform(0.1, 0.5, 7),
+        }
+    )
+    st.line_chart(latency_data.set_index("Date"))
+
+    # Button to simulate refreshing metrics
+    if st.button("Refresh Metrics"):
+        st.experimental_rerun()
+else:
+    st.warning("Please login to access the admin panel.")
pollution_data.csv ADDED
@@ -0,0 +1,9 @@
+date,NO2,O3
+2024-10-17,22.804605103280675,22.769159859976643
+2024-10-18,23.2685,23.30733245729302
+2024-10-19,23.91006441223834,23.1717142857143
+2024-10-20,22.573237547892735,23.53784452296821
+2024-10-21,21.1457004830918,24.020695652173934
+2024-10-22,21.776579804560274,23.33588571428572
+2024-10-23,21.974793814433,22.21468879668051
+2024-10-24,25.51256756756757,20.91370967741937
python.py DELETED
@@ -1,3 +0,0 @@
-from data_api_calls import get_data
-
-get_data()
src/data_api_calls.py ADDED
@@ -0,0 +1,181 @@
+import codecs
+import csv
+import http.client
+import os
+import re
+import sys
+import urllib.request
+from datetime import date, timedelta
+from io import StringIO
+
+import pandas as pd
+
+WEATHER_DATA_FILE = "weather_data.csv"
+POLLUTION_DATA_FILE = "pollution_data.csv"
+
+
+def update_weather_data():
+    today = date.today().isoformat()
+
+    if os.path.exists(WEATHER_DATA_FILE):
+        df = pd.read_csv(WEATHER_DATA_FILE)
+        last_date = pd.to_datetime(df["date"]).max()
+        start_date = (last_date + timedelta(1)).isoformat()
+    else:
+        df = pd.DataFrame()
+        start_date = (date.today() - timedelta(7)).isoformat()
+
+    try:
+        ResultBytes = urllib.request.urlopen(
+            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{today}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
+        )
+        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
+
+        new_data = pd.DataFrame(list(CSVText))
+        new_data.columns = new_data.iloc[0]
+        new_data = new_data[1:]
+        new_data = new_data.rename(columns={"datetime": "date"})
+
+        updated_df = pd.concat([df, new_data], ignore_index=True)
+        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
+        updated_df.to_csv(WEATHER_DATA_FILE, index=False)
+
+    except urllib.error.HTTPError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+    except urllib.error.URLError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+
+
+def update_pollution_data():
+    O3 = []
+    NO2 = []
+    particles = ["NO2", "O3"]
+    stations = ["NL10636", "NL10639", "NL10643"]
+    all_dataframes = []
+    today = date.today().isoformat() + "T09:00:00Z"
+    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
+    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
+    days_today = 0
+    days_yesterday = 1
+    while today != latest_date:
+        days_today += 1
+        days_yesterday += 1
+        for particle in particles:
+            for station in stations:
+                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
+                payload = ""
+                headers = {}
+                conn.request(
+                    "GET",
+                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                    payload,
+                    headers,
+                )
+                res = conn.getresponse()
+                data = res.read()
+                decoded_data = data.decode("utf-8")
+                df = pd.read_csv(StringIO(decoded_data))
+                df = df.filter(like="value")
+                all_dataframes.append(df)
+            combined_data = pd.concat(all_dataframes, ignore_index=True)
+            values = []
+            for row in combined_data:
+                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
+                if cleaned_value:  # If we successfully extract a number
+                    values.append(
+                        float(cleaned_value[0])
+                    )  # Convert the first match to float
+
+            # Compute the average if the values list is not empty
+            if values:
+                avg = sum(values) / len(values)
+                if particle == "NO2":
+                    NO2.append(avg)
+                else:
+                    O3.append(avg)
+        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
+        yesterday = (
+            date.today() - timedelta(days_yesterday)
+        ).isoformat() + "T09:00:00Z"
+
+    avg_combined_data = pd.DataFrame(
+        {
+            "date": pd.date_range(end=date.today(), periods=len(NO2)),
+            "NO2": NO2,
+            "O3": O3,
+        }
+    )
+
+    avg_combined_data = reverse_pollution(NO2, O3, avg_combined_data)
+
+    if os.path.exists(POLLUTION_DATA_FILE):
+        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
+        last_date = pd.to_datetime(existing_data["date"]).max()
+        new_data = avg_combined_data[avg_combined_data["date"] > last_date]
+        updated_data = pd.concat([existing_data, new_data], ignore_index=True)
+        updated_data.drop_duplicates(subset="date", keep="last", inplace=True)
+    else:
+        updated_data = avg_combined_data
+
+    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
+
+
+def reverse_pollution(NO2, O3, data):
+    df = data
+    start_index = 0
+    while NO2:
+        df.loc[start_index, "NO2"] = NO2.pop()
+        start_index += 1
+    start_index = 0
+    while O3:
+        df.loc[start_index, "O3"] = O3.pop()
+        start_index += 1
+    return df
+
+
+def get_combined_data():
+    update_weather_data()
+    update_pollution_data()
+
+    weather_df = pd.read_csv(WEATHER_DATA_FILE)
+    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
+
+    # Average NO2 and O3 values by date and add them to weather data
+    combined_df = pd.merge(weather_df, pollution_df, on="date", how="left")
+    combined_df.fillna(0, inplace=True)
+
+    # Apply scaling and renaming similar to the scale function from previous code
+    combined_df = combined_df.rename(
+        columns={
+            "date": "date",
+            "windspeed": "wind_speed",
+            "temp": "mean_temp",
+            "solarradiation": "global_radiation",
+            "precip": "percipitation",
+            "sealevelpressure": "pressure",
+            "visibility": "minimum_visibility",
+        }
+    )
+
+    combined_df["date"] = pd.to_datetime(combined_df["date"])
+    combined_df["weekday"] = combined_df["date"].dt.day_name()
+
+    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
+    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
+    combined_df["percipitation"] = combined_df["percipitation"] * 10
+    combined_df["pressure"] = combined_df["pressure"] * 10
+
+    combined_df["wind_speed"] = combined_df["wind_speed"].astype(int)
+    combined_df["mean_temp"] = combined_df["mean_temp"].astype(int)
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(int)
+    combined_df["percipitation"] = combined_df["percipitation"].astype(int)
+    combined_df["pressure"] = combined_df["pressure"].astype(int)
+    combined_df["humidity"] = combined_df["humidity"].astype(int)
+    combined_df["global_radiation"] = combined_df["global_radiation"].astype(int)
+
+    return combined_df
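
(For orientation, a minimal sketch of how this new module is driven; the import and function name come from the file above, while actually running it requires the Visual Crossing and Luchtmeetnet endpoints to be reachable, and the column list is what get_combined_data yields after its renaming step.)

from src.data_api_calls import get_combined_data

week_data = get_combined_data()
# Expected columns: date, NO2, O3, wind_speed, mean_temp, global_radiation,
# percipitation, pressure, minimum_visibility, humidity, weekday
print(week_data.tail(3))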
src/{data_loading.py → features_pipeline.py} RENAMED
@@ -1,8 +1,12 @@
+import warnings
+
 import joblib
 import numpy as np
 import pandas as pd
 
-from past_data_api_calls import get_past_data
+from src.past_data_api_calls import get_past_combined_data
+
+warnings.filterwarnings("ignore")
 
 
 def create_features(
@@ -11,37 +15,6 @@ def create_features(
     lag_days=7,
     sma_days=7,
 ):
-    """
-    Creates lagged features, SMA features, last year's particle data (NO2 and O3) for specific days,
-    sine and cosine transformations for 'weekday' and 'month', and target variables for the specified
-    particle ('O3' or 'NO2') for the next 'days_ahead' days. Scales features and targets without
-    disregarding outliers and saves the scalers for inverse scaling. Splits the data into train,
-    validation, and test sets using the most recent dates. Prints the number of rows with missing
-    values dropped from the dataset.
-
-    Parameters:
-    - data (pd.DataFrame): The input time-series dataset.
-    - target_particle (str): The target particle ('O3' or 'NO2') for which targets are created.
-    - lag_days (int): Number of lag days to create features for (default 7).
-    - sma_days (int): Window size for Simple Moving Average (default 7).
-    - days_ahead (int): Number of days ahead to create target variables for (default 3).
-
-    Returns:
-    - X_train_scaled (pd.DataFrame): Scaled training features.
-    - y_train_scaled (pd.DataFrame): Scaled training targets.
-    - X_val_scaled (pd.DataFrame): Scaled validation features (365 days).
-    - y_val_scaled (pd.DataFrame): Scaled validation targets (365 days).
-    - X_test_scaled (pd.DataFrame): Scaled test features (365 days).
-    - y_test_scaled (pd.DataFrame): Scaled test targets (365 days).
-    """
-    import warnings
-
-    import numpy as np
-    import pandas as pd
-    from sklearn.preprocessing import StandardScaler
-
-    warnings.filterwarnings("ignore")
-
     lag_features = [
         "NO2",
         "O3",
@@ -70,9 +43,7 @@ def create_features(
     # Create sine and cosine transformations for 'weekday' and 'month'
     data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
     data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
-    data["month_sin"] = np.sin(
-        2 * np.pi * (data["month"] - 1) / 12
-    )  # Adjust month to 0-11
+    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
     data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)
 
     # Create lagged features for the specified lag days
@@ -86,32 +57,26 @@ def create_features(
             data[feature].rolling(window=sma_days).mean()
         )
 
-    past_data = get_past_data()
     # Create particle data (NO2 and O3) from the same time last year
+    past_data = get_past_combined_data()
 
     # Today last year
-    data["O3_last_year"] = past_data["O3"].iloc[-4]  # data["O3_last_year"] = data["O3"].shift(365)
-    data["NO2_last_year"] = past_data["NO2"].iloc[-4]  # data["NO2_last_year"] = data["NO2"].shift(365)
+    data["O3_last_year"] = past_data["O3"].iloc[-4]
+    data["NO2_last_year"] = past_data["NO2"].iloc[-4]
 
     # 7 days before today last year
-    for i in range(1, lag_days+1):
-        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i-1]  # data["O3"].shift(365 + i)
-        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i-1]  # data["NO2"].shift(365 + i)
+    for i in range(1, lag_days + 1):
+        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i - 1]
+        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i - 1]
 
     # 3 days after today last year
-    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]  # data["O3"].shift(365 - 3)
-    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]  # data["NO2"].shift(365 - 3)
-
-    # Calculate the number of rows before dropping missing values
-    rows_before = data.shape[0]
+    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]
+    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]
 
     # Drop missing values
+    rows_before = data.shape[0]
     data = data.dropna().reset_index(drop=True)
-
-    # Calculate the number of rows after dropping missing values
     rows_after = data.shape[0]
-
-    # Calculate and print the number of rows dropped
     rows_dropped = rows_before - rows_after
     print(f"Number of rows with missing values dropped: {rows_dropped}")
@@ -125,16 +90,11 @@ def create_features(
     # Split features and targets
     x = data[feature_cols]
 
-
-    # Initialize scalers
+    # Scale
     feature_scaler = joblib.load(f"scalers/feature_scaler_{target_particle}.joblib")
-
-    # Fit the scalers on the training data
     X_scaled = feature_scaler.transform(x)
 
     # Convert scaled data back to DataFrame for consistency
-    X_scaled = pd.DataFrame(
-        X_scaled, columns=feature_cols, index=x.index
-    )
+    X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=x.index)
 
     return X_scaled
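
(A self-contained illustration of the weekday/month cyclical encoding that create_features applies; the toy values below are made up, only the formulas match the diff above.)

import numpy as np
import pandas as pd

toy = pd.DataFrame({"weekday": [0, 3, 6], "month": [1, 6, 12]})
toy["weekday_sin"] = np.sin(2 * np.pi * toy["weekday"] / 7)
toy["weekday_cos"] = np.cos(2 * np.pi * toy["weekday"] / 7)
toy["month_sin"] = np.sin(2 * np.pi * (toy["month"] - 1) / 12)  # month shifted to 0-11
toy["month_cos"] = np.cos(2 * np.pi * (toy["month"] - 1) / 12)
print(toy.round(3))  # adjacent days/months map to nearby points on the unit circle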
past_data_api_calls.py → src/past_data_api_calls copy.py RENAMED
@@ -17,7 +17,9 @@ def pollution_data():
     last_year_date = date.today() - timedelta(days=365)
     start_date = last_year_date - timedelta(days=7)
     end_date = last_year_date + timedelta(days=3)
-    date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
+    date_list = [
+        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
+    ]
     for current_date in date_list:
         today = current_date.isoformat() + "T09:00:00Z"
         yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
@@ -25,24 +27,31 @@ def pollution_data():
             all_dataframes = []  # Reset for each particle
             for station in stations:
                 conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
-                payload = ''
+                payload = ""
                 headers = {}
-                conn.request("GET", f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}", payload, headers)
+                conn.request(
+                    "GET",
+                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                    payload,
+                    headers,
+                )
                 res = conn.getresponse()
                 data = res.read()
                 decoded_data = data.decode("utf-8")
                 df = pd.read_csv(StringIO(decoded_data))
-                df = df.filter(like='value')
+                df = df.filter(like="value")
                 all_dataframes.append(df)
             if all_dataframes:
                 combined_data = pd.concat(all_dataframes, ignore_index=True)
-                combined_data.to_csv(f'{particle}_{today}.csv', index=False)
+                combined_data.to_csv(f"{particle}_{today}.csv", index=False)
+
 
 def delete_csv(csvs):
     for csv_file in csvs:
-        if(os.path.exists(csv_file) and os.path.isfile(csv_file)):
+        if os.path.exists(csv_file) and os.path.isfile(csv_file):
             os.remove(csv_file)
 
+
 def clean_values():
     particles = ["NO2", "O3"]
     csvs = []
@@ -51,25 +60,29 @@ def clean_values():
     last_year_date = date.today() - timedelta(days=365)
     start_date = last_year_date - timedelta(days=7)
     end_date = last_year_date + timedelta(days=3)
-    date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
+    date_list = [
+        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
+    ]
     for current_date in date_list:
         today = current_date.isoformat() + "T09:00:00Z"
         for particle in particles:
-            name = f'{particle}_{today}.csv'
+            name = f"{particle}_{today}.csv"
             csvs.append(name)
     for csv_file in csvs:
         if not os.path.exists(csv_file):
             continue  # Skip if the file doesn't exist
         values = []  # Reset values for each CSV file
         # Open the CSV file and read the values
-        with open(csv_file, 'r') as file:
+        with open(csv_file, "r") as file:
             reader = csv.reader(file)
             for row in reader:
                 for value in row:
                     # Use regular expressions to extract numeric part
                     cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
                     if cleaned_value:  # If we successfully extract a number
-                        values.append(float(cleaned_value[0]))  # Convert the first match to float
+                        values.append(
+                            float(cleaned_value[0])
+                        )  # Convert the first match to float
 
         # Compute the average if the values list is not empty
         if values:
@@ -81,16 +94,18 @@ def clean_values():
     delete_csv(csvs)
     return NO2, O3
 
+
 def add_columns():
-    file_path = 'weather_data.csv'
+    file_path = "weather_data.csv"
     df = pd.read_csv(file_path)
 
-    df.insert(1, 'NO2', None)
-    df.insert(2, 'O3', None)
-    df.insert(10, 'weekday', None)
+    df.insert(1, "NO2", None)
+    df.insert(2, "O3", None)
+    df.insert(10, "weekday", None)
 
     return df
 
+
 def scale(data):
     df = data
     columns = list(df.columns)
@@ -104,97 +119,72 @@ def scale(data):
     columns.insert(9, columns.pop(6))
     df = df[columns]
 
-    df = df.rename(columns={
-        'datetime':'date',
-        'windspeed': 'wind_speed',
-        'temp': 'mean_temp',
-        'solarradiation':'global_radiation',
-        'precip':'percipitation',
-        'sealevelpressure':'pressure',
-        'visibility':'minimum_visibility'
-    })
+    df = df.rename(
+        columns={
+            "datetime": "date",
+            "windspeed": "wind_speed",
+            "temp": "mean_temp",
+            "solarradiation": "global_radiation",
+            "precip": "percipitation",
+            "sealevelpressure": "pressure",
+            "visibility": "minimum_visibility",
+        }
+    )
 
-    df['date'] = pd.to_datetime(df['date'])
-    df['weekday'] = df['date'].dt.day_name()
+    df["date"] = pd.to_datetime(df["date"])
+    df["weekday"] = df["date"].dt.day_name()
 
-    df = df.sort_values(by='date').reset_index(drop=True)
+    df = df.sort_values(by="date").reset_index(drop=True)
 
-    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
-    df['mean_temp'] = df['mean_temp'] * 10
-    df['minimum_visibility'] = df['minimum_visibility'] * 10
-    df['percipitation'] = df['percipitation'] * 10
-    df['pressure'] = df['pressure']
+    df["wind_speed"] = (df["wind_speed"] / 3.6) * 10
+    df["mean_temp"] = df["mean_temp"] * 10
+    df["minimum_visibility"] = df["minimum_visibility"] * 10
+    df["percipitation"] = df["percipitation"] * 10
+    df["pressure"] = df["pressure"]
 
-    df['wind_speed'] = df['wind_speed'].astype(int)
-    df['mean_temp'] = df['mean_temp'].astype(int)
-    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
-    df['percipitation'] = df['percipitation'].astype(int)
-    df['pressure'] = df['pressure'].astype(int)
-    df['humidity'] = df['humidity'].astype(int)
-    df['global_radiation'] = df['global_radiation'].astype(int)
+    df["wind_speed"] = df["wind_speed"].astype(int)
+    df["mean_temp"] = df["mean_temp"].astype(int)
+    df["minimum_visibility"] = df["minimum_visibility"].astype(int)
+    df["percipitation"] = df["percipitation"].astype(int)
+    df["pressure"] = df["pressure"].astype(int)
+    df["humidity"] = df["humidity"].astype(int)
+    df["global_radiation"] = df["global_radiation"].astype(int)
 
     return df
 
+
 def insert_pollution(NO2, O3, data):
     df = data
-    df['NO2'] = NO2
-    df['O3'] = O3
+    df["NO2"] = NO2
+    df["O3"] = O3
     return df
 
+
 def weather_data():
-    # Get last year's same day
     last_year_date = date.today() - timedelta(days=365)
-    # Start date is 7 days prior
     start_date = (last_year_date - timedelta(days=7)).isoformat()
-    # End date is 3 days ahead
     end_date = (last_year_date + timedelta(days=3)).isoformat()
-    try:
-        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
-
-        # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
-        # Saving the CSV content to a file
-        current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, 'weather_data.csv')
-        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerows(CSVText)
-
-    except urllib.error.HTTPError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
-        sys.exit()
-    except urllib.error.URLError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code,ErrorInfo)
-        sys.exit()
-
-def weather_data():
-    # Set up dates for last year: 7 days before today last year, and 3 days ahead of this day last year
-    today_last_year = date.today() - timedelta(365)
-    start_last_year = today_last_year - timedelta(8)
-    end_last_year = today_last_year + timedelta(2)
-
-    try:
-        # API call with new date range for last year
-        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_last_year}/{end_last_year}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
-
+    try:
+        ResultBytes = urllib.request.urlopen(
+            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
+        )
+
         # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
+        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
         # Saving the CSV content to a file
         current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, 'weather_data.csv')
-        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
+        file_path = os.path.join(current_dir, "past_weather_data.csv")
+        with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
             csv_writer = csv.writer(csvfile)
             csv_writer.writerows(CSVText)
 
     except urllib.error.HTTPError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
         sys.exit()
     except urllib.error.URLError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code,ErrorInfo)
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
        sys.exit()
@@ -205,5 +195,5 @@ def get_past_data():
     df = add_columns()
     scaled_df = scale(df)
     output_df = insert_pollution(NO2, O3, scaled_df)
-    os.remove('weather_data.csv')
-    return output_df
+    os.remove("past_weather_data.csv")
+    return output_df
src/past_data_api_calls.py ADDED
@@ -0,0 +1,140 @@
+import codecs
+import csv
+import http.client
+import re
+import sys
+import urllib.request
+from datetime import date, timedelta
+from io import StringIO
+
+import pandas as pd
+
+PAST_WEATHER_DATA_FILE = "weather_data.csv"
+PAST_POLLUTION_DATA_FILE = "pollution_data.csv"
+
+
+def get_past_weather_data():
+    last_year_date = date.today() - timedelta(days=365)
+    start_date = (last_year_date - timedelta(days=7)).isoformat()
+    end_date = (last_year_date + timedelta(days=3)).isoformat()
+
+    try:
+        ResultBytes = urllib.request.urlopen(
+            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
+        )
+        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
+
+        data = pd.DataFrame(list(CSVText))
+        data.columns = data.iloc[0]
+        data = data[1:]
+        data = data.rename(columns={"datetime": "date"})
+        return data
+
+    except urllib.error.HTTPError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+    except urllib.error.URLError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+
+
+def get_past_pollution_data():
+    O3 = []
+    NO2 = []
+    particles = ["NO2", "O3"]
+    stations = ["NL10636", "NL10639", "NL10643"]
+    all_dataframes = []
+    last_year_date = date.today() - timedelta(days=365)
+    start_date = last_year_date - timedelta(days=7)
+    end_date = last_year_date + timedelta(days=3)
+    date_list = [
+        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
+    ]
+    for current_date in date_list:
+        today = current_date.isoformat() + "T09:00:00Z"
+        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
+        for particle in particles:
+            all_dataframes = []  # Reset for each particle
+            for station in stations:
+                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
+                payload = ""
+                headers = {}
+                conn.request(
+                    "GET",
+                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                    payload,
+                    headers,
+                )
+                res = conn.getresponse()
+                data = res.read()
+                decoded_data = data.decode("utf-8")
+                df = pd.read_csv(StringIO(decoded_data))
+                df = df.filter(like="value")
+                all_dataframes.append(df)
+
+            combined_data = pd.concat(all_dataframes, ignore_index=True)
+            values = []
+            for row in combined_data:
+                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
+                if cleaned_value:
+                    values.append(float(cleaned_value[0]))
+
+            if values:
+                avg = sum(values) / len(values)
+                if particle == "NO2":
+                    NO2.append(avg)
+                else:
+                    O3.append(avg)
+
+    return NO2, O3
+
+
+def get_past_combined_data():
+    weather_df = get_past_weather_data()
+    NO2_df, O3_df = get_past_pollution_data()
+
+    combined_df = weather_df
+    combined_df["NO2"] = NO2_df
+    combined_df["O3"] = O3_df
+
+    # Apply scaling and renaming similar to the scale function from previous code
+    combined_df = combined_df.rename(
+        columns={
+            "date": "date",
+            "windspeed": "wind_speed",
+            "temp": "mean_temp",
+            "solarradiation": "global_radiation",
+            "precip": "percipitation",
+            "sealevelpressure": "pressure",
+            "visibility": "minimum_visibility",
+        }
+    )
+
+    combined_df["date"] = pd.to_datetime(combined_df["date"])
+    combined_df["weekday"] = combined_df["date"].dt.day_name()
+
+    combined_df["wind_speed"] = combined_df["wind_speed"].astype(float)
+    combined_df["mean_temp"] = combined_df["mean_temp"].astype(float)
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float)
+    combined_df["percipitation"] = combined_df["percipitation"].astype(float)
+    combined_df["pressure"] = combined_df["pressure"].astype(float).round()
+    combined_df["humidity"] = combined_df["humidity"].astype(float).round()
+    combined_df["global_radiation"] = combined_df["global_radiation"].astype(float)
+
+    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
+    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
+    combined_df["percipitation"] = combined_df["percipitation"] * 10
+    combined_df["pressure"] = combined_df["pressure"] * 10
+
+    combined_df["wind_speed"] = combined_df["wind_speed"].astype(float).round().astype(int)
+    combined_df["mean_temp"] = combined_df["mean_temp"].astype(float).round().astype(int)
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float).round().astype(int)
+    combined_df["percipitation"] = combined_df["percipitation"].astype(float).round().astype(int)
+    combined_df["pressure"] = combined_df["pressure"].astype(float).round().astype(int)
+    combined_df["humidity"] = combined_df["humidity"].astype(float).round().astype(int)
+    combined_df["global_radiation"] = combined_df["global_radiation"].astype(float).round().astype(int)
+
+    return combined_df
src/{models_loading.py → predict.py} RENAMED
@@ -1,12 +1,15 @@
 import os
 
 import joblib
-import pandas as pd
 import streamlit as st
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
-from src.data_loading import create_features
 
+from src.data_api_calls import get_combined_data
+from src.features_pipeline import create_features
+
+
+@st.cache_resource()
 def load_model(particle):
     load_dotenv()
     login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
@@ -15,21 +18,30 @@ def load_model(particle):
     if particle == "O3":
         file_name = "O3_svr_model.pkl"
     elif particle == "NO2":
-        file_name == "hehehe"
+        file_name = "NO2_nn_model.pkl"
 
     model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
     model = joblib.load(model_path)
-
     return model
 
 
-@st.cache_resource(ttl=6 * 300)  # Reruns every 6 hours
 def run_model(particle, data):
     input_data = create_features(data=data, target_particle=particle)
     model = load_model(particle)
-
-    # Run the model with static input
     prediction = model.predict(input_data)
     target_scaler = joblib.load(f"scalers/target_scaler_{particle}.joblib")
     prediction = target_scaler.inverse_transform(prediction)
     return prediction
+
+
+def get_data_and_predictions():
+    PREDICTIONS_FILE = "predictions_history.csv"
+
+    week_data = get_combined_data()
+
+    o3_input_features = create_features(week_data, "O3")
+    no2_input_features = create_features(week_data, "NO2")
+    o3_predictions = run_model("O3", data=o3_input_features)
+    no2_predictions = run_model("NO2", data=no2_input_features)
+
+    return week_data, o3_predictions, no2_predictions
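
(A minimal sketch of the new entry point as app.py consumes it; the result shapes are inferred from how app.py indexes them, and error handling is omitted.)

from src.predict import get_data_and_predictions

week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
# week_data: one row per day for the past week (weather plus NO2/O3)
# predictions_*: arrays whose first row holds the 3-day-ahead forecast,
# e.g. predictions_O3[0].flatten() -> three O3 values in µg/m³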
test.ipynb CHANGED
@@ -9,34 +9,63 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "/Users/mihkelmariuszjezierski/anaconda3/envs/ml-industry/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
-   "from src.models_loading import run_model\n",
-   "from data_api_calls import get_data\n",
-   "import pandas as pd\n",
-   "from past_data_api_calls import get_past_data"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
-   "from src.data_loading import create_features"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
-   "df = get_data()"
   ]
  },
  {
@@ -672,7 +701,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "ml-industry",
    "language": "python",
    "name": "python3"
   },
@@ -686,7 +715,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 
     "name": "stderr",
     "output_type": "stream",
     "text": [
+     "c:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
+   "from src.data_api_calls import get_combined_data\n",
+   "from src.past_data_api_calls import get_past_combined_data\n",
+   "from src.predict import get_data_and_predictions"
   ]
  },
  {
   "cell_type": "code",
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+   "get_past_combined_data()"
   ]
  },
  {
   "cell_type": "code",
+  "execution_count": 2,
+  "metadata": {},
+  "outputs": [
+   {
+    "ename": "OSError",
+    "evalue": "[Errno 22] Invalid argument: 'NO2_2023-10-18T09:00:00Z.csv'",
+    "output_type": "error",
+    "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m week_data, predictions_O3, predictions_NO2 \u001b[38;5;241m=\u001b[39m \u001b[43mget_data_and_predictions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\predict.py:42\u001b[0m, in \u001b[0;36mget_data_and_predictions\u001b[1;34m()\u001b[0m\n\u001b[0;32m 38\u001b[0m PREDICTIONS_FILE \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpredictions_history.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 40\u001b[0m week_data \u001b[38;5;241m=\u001b[39m get_combined_data()\n\u001b[1;32m---> 42\u001b[0m o3_input_features \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweek_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mO3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 43\u001b[0m no2_input_features \u001b[38;5;241m=\u001b[39m create_features(week_data, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNO2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 44\u001b[0m o3_predictions \u001b[38;5;241m=\u001b[39m run_model(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3\u001b[39m\u001b[38;5;124m\"\u001b[39m, data\u001b[38;5;241m=\u001b[39mo3_input_features)\n",
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\features_pipeline.py:61\u001b[0m, in \u001b[0;36mcreate_features\u001b[1;34m(data, target_particle, lag_days, sma_days)\u001b[0m\n\u001b[0;32m 56\u001b[0m data[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfeature\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_sma_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msma_days\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 57\u001b[0m data[feature]\u001b[38;5;241m.\u001b[39mrolling(window\u001b[38;5;241m=\u001b[39msma_days)\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m 58\u001b[0m )\n\u001b[0;32m 60\u001b[0m \u001b[38;5;66;03m# Create particle data (NO2 and O3) from the same time last year\u001b[39;00m\n\u001b[1;32m---> 61\u001b[0m past_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_past_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 63\u001b[0m \u001b[38;5;66;03m# Today last year\u001b[39;00m\n\u001b[0;32m 64\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3_last_year\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m past_data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m4\u001b[39m]\n",
47
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\past_data_api_calls.py:193\u001b[0m, in \u001b[0;36mget_past_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_past_data\u001b[39m():\n\u001b[0;32m 192\u001b[0m weather_data()\n\u001b[1;32m--> 193\u001b[0m \u001b[43mpollution_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 194\u001b[0m NO2, O3 \u001b[38;5;241m=\u001b[39m clean_values()\n\u001b[0;32m 195\u001b[0m df \u001b[38;5;241m=\u001b[39m add_columns()\n",
48
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\past_data_api_calls.py:46\u001b[0m, in \u001b[0;36mpollution_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 44\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m all_dataframes:\n\u001b[0;32m 45\u001b[0m combined_data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(all_dataframes, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m---> 46\u001b[0m \u001b[43mcombined_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mparticle\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mtoday\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
49
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\util\\_decorators.py:333\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 327\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 328\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 329\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 330\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 331\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 332\u001b[0m )\n\u001b[1;32m--> 333\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
50
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:3967\u001b[0m, in \u001b[0;36mNDFrame.to_csv\u001b[1;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, lineterminator, chunksize, date_format, doublequote, escapechar, decimal, errors, storage_options)\u001b[0m\n\u001b[0;32m 3956\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m, ABCDataFrame) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mto_frame()\n\u001b[0;32m 3958\u001b[0m formatter \u001b[38;5;241m=\u001b[39m DataFrameFormatter(\n\u001b[0;32m 3959\u001b[0m frame\u001b[38;5;241m=\u001b[39mdf,\n\u001b[0;32m 3960\u001b[0m header\u001b[38;5;241m=\u001b[39mheader,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 3964\u001b[0m decimal\u001b[38;5;241m=\u001b[39mdecimal,\n\u001b[0;32m 3965\u001b[0m )\n\u001b[1;32m-> 3967\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mDataFrameRenderer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_csv\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 3968\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_buf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3969\u001b[0m \u001b[43m \u001b[49m\u001b[43mlineterminator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlineterminator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3970\u001b[0m \u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3971\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3972\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3973\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3974\u001b[0m \u001b[43m \u001b[49m\u001b[43mquoting\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquoting\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3975\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3976\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex_label\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex_label\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3977\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3978\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3979\u001b[0m \u001b[43m \u001b[49m\u001b[43mquotechar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquotechar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3980\u001b[0m \u001b[43m \u001b[49m\u001b[43mdate_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdate_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3981\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdoublequote\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdoublequote\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3982\u001b[0m \u001b[43m \u001b[49m\u001b[43mescapechar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mescapechar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3983\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3984\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
51
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\io\\formats\\format.py:1014\u001b[0m, in \u001b[0;36mDataFrameRenderer.to_csv\u001b[1;34m(self, path_or_buf, encoding, sep, columns, index_label, mode, compression, quoting, quotechar, lineterminator, chunksize, date_format, doublequote, escapechar, errors, storage_options)\u001b[0m\n\u001b[0;32m 993\u001b[0m created_buffer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 995\u001b[0m csv_formatter \u001b[38;5;241m=\u001b[39m CSVFormatter(\n\u001b[0;32m 996\u001b[0m path_or_buf\u001b[38;5;241m=\u001b[39mpath_or_buf,\n\u001b[0;32m 997\u001b[0m lineterminator\u001b[38;5;241m=\u001b[39mlineterminator,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1012\u001b[0m formatter\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfmt,\n\u001b[0;32m 1013\u001b[0m )\n\u001b[1;32m-> 1014\u001b[0m \u001b[43mcsv_formatter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1016\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m created_buffer:\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path_or_buf, StringIO)\n",
52
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\io\\formats\\csvs.py:251\u001b[0m, in \u001b[0;36mCSVFormatter.save\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 247\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 248\u001b[0m \u001b[38;5;124;03mCreate the writer & save.\u001b[39;00m\n\u001b[0;32m 249\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 250\u001b[0m \u001b[38;5;66;03m# apply compression and byte/text conversion\u001b[39;00m\n\u001b[1;32m--> 251\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 252\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 253\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 254\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 255\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 256\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 257\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 258\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m handles:\n\u001b[0;32m 259\u001b[0m \u001b[38;5;66;03m# Note: self.encoding is irrelevant here\u001b[39;00m\n\u001b[0;32m 260\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwriter \u001b[38;5;241m=\u001b[39m csvlib\u001b[38;5;241m.\u001b[39mwriter(\n\u001b[0;32m 261\u001b[0m handles\u001b[38;5;241m.\u001b[39mhandle,\n\u001b[0;32m 262\u001b[0m lineterminator\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineterminator,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 267\u001b[0m quotechar\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mquotechar,\n\u001b[0;32m 268\u001b[0m )\n\u001b[0;32m 270\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_save()\n",
53
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[0;32m 874\u001b[0m handle,\n\u001b[0;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m 876\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[0;32m 877\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[0;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 879\u001b[0m )\n\u001b[0;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
54
+ "\u001b[1;31mOSError\u001b[0m: [Errno 22] Invalid argument: 'NO2_2023-10-18T09:00:00Z.csv'"
55
+ ]
56
+ }
57
+ ],
+ "source": [
+ "week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
+ "week_data"
  ]
  },
  {
 
  ],
  "metadata": {
  "kernelspec": {
+ "display_name": ".venv",
  "language": "python",
  "name": "python3"
  },
 
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
+ "version": "3.11.8"
  }
  },
  "nbformat": 4,
test.py DELETED
@@ -1,3 +0,0 @@
- from models_loading import run_model
-
-

weather_data.csv ADDED
@@ -0,0 +1,9 @@
+ date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
+ 2024-10-17,16.9,86.0,0.6,18.4,1010.0,37.1,43.0
+ 2024-10-18,15.5,97.3,3.9,7.6,1014.0,4.5,42.9
+ 2024-10-19,14.7,89.9,1.6,14.8,1014.1,22.8,43.5
+ 2024-10-20,15.5,83.8,0.5,29.5,1016.0,41.5,0.0
+ 2024-10-21,14.4,92.7,4.3,21.2,1020.6,22.0,27.8
+ 2024-10-22,11.4,92.8,4.9,19.4,1026.9,22.6,57.0
+ 2024-10-23,11.2,97.3,0.0,13.0,1032.8,6.5,12.5
+ 2024-10-24,10.4,94.0,0.0,20.5,1024.7,13.0,62.5
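
The added weather_data.csv snapshots eight days of daily weather features for
the pipeline. A minimal sketch for loading it, assuming the file sits in the
repo root; column units are not stated in the CSV, so readings such as temp
being in degrees Celsius are assumptions:

    import pandas as pd

    # Parse the date column so the rows can be sorted and joined by day.
    weather = pd.read_csv("weather_data.csv", parse_dates=["date"])
    print(weather.dtypes)
    print(weather.tail(3))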