elisaklunder committed
Commit 1d3c9ee · Parent: e3ae012

data pipelines
app.py CHANGED
@@ -3,9 +3,8 @@ import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
 
-from data_api_calls import get_data
 from src.helper_functions import custom_metric_box, pollution_box
-from src.models_loading import run_model
+from src.predict import get_data_and_predictions
 
 st.set_page_config(
     page_title="Utrecht Pollution Dashboard",
@@ -16,33 +15,24 @@ st.set_page_config(
 
 alt.themes.enable("dark")
 
-dataset = get_data()
-today = dataset.iloc[-1]
-previous_day = dataset.iloc[-2]
-prediction = run_model("O3", data=dataset)
-pred1 = prediction[0][0]
-pred2 = prediction[0][1]
-pred3 = prediction[0][2]
+week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
+today = week_data.iloc[-1]
+previous_day = week_data.iloc[-2]
 
 dates_past = pd.date_range(end=pd.Timestamp.today(), periods=8).to_list()
-dates_future = pd.date_range(start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3).to_list()
+dates_future = pd.date_range(
+    start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3
+).to_list()
 
 # O3 and NO2 values for the past 7 days
-o3_past_values = dataset["O3"]
-no2_past_values = dataset["NO2"]
-
-# Predicted O3 and NO2 values for the next 3 days (convert to pandas Series)
-o3_future_values = pd.Series(prediction[0].flatten())  # Flatten the array to 1D
-no2_future_values = pd.Series([26, 27, 28])  # Example prediction data
-
-# Combine the past and future values using pd.concat
+o3_past_values = week_data["O3"]
+no2_past_values = week_data["NO2"]
+o3_future_values = pd.Series(predictions_O3[0].flatten())
+no2_future_values = pd.Series(predictions_NO2[0].flatten())
 o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)
 no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
 
-# Combine dates and values
 dates = dates_past + dates_future
-
-# Create a DataFrame
 df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
@@ -55,13 +45,37 @@ with col1:
     st.subheader("Current Weather")
     subcol1, subcol2 = st.columns((1, 1))
     with subcol1:
-        custom_metric_box(label="Temperature", value=f"{round(today['mean_temp'] * 0.1)} °C", delta=f"{round(today['mean_temp'] * 0.1) - round(previous_day['mean_temp'] * 0.1)} °C")
-        custom_metric_box(label="Humidity", value=f"{round(today['humidity'])} %", delta=f"{round(today['humidity']) - round(previous_day['humidity'])} %")
-        custom_metric_box(label="Pressure", value=f"{round(today['pressure'] * 0.1)} hPa", delta=f"{round(today['pressure'] * 0.1) - round(previous_day['pressure'] * 0.1)} hPa")
+        custom_metric_box(
+            label="Temperature",
+            value=f"{round(today['mean_temp'] * 0.1)} °C",
+            delta=f"{round(today['mean_temp'] * 0.1) - round(previous_day['mean_temp'] * 0.1)} °C",
+        )
+        custom_metric_box(
+            label="Humidity",
+            value=f"{round(today['humidity'])} %",
+            delta=f"{round(today['humidity']) - round(previous_day['humidity'])} %",
+        )
+        custom_metric_box(
+            label="Pressure",
+            value=f"{round(today['pressure'] * 0.1)} hPa",
+            delta=f"{round(today['pressure'] * 0.1) - round(previous_day['pressure'] * 0.1)} hPa",
+        )
     with subcol2:
-        custom_metric_box(label="Precipitation", value=f"{round(today['percipitation'] * 0.1)} mm", delta=f"{round(today['percipitation'] * 0.1) - round(previous_day['percipitation'] * 0.1)} mm")
-        custom_metric_box(label="Solar Radiation", value=f"{round(today['global_radiation'])} J/m²", delta=f"{round(today['global_radiation']) - round(previous_day['global_radiation'])} J/m²")
-        custom_metric_box(label="Wind Speed", value=f"{round(today['wind_speed'] * 0.1, 1)} m/s", delta=f"{round(today['wind_speed'] * 0.1, 1) - round(previous_day['wind_speed'] * 0.1, 1)} m/s")
+        custom_metric_box(
+            label="Precipitation",
+            value=f"{round(today['percipitation'] * 0.1)} mm",
+            delta=f"{round(today['percipitation'] * 0.1) - round(previous_day['percipitation'] * 0.1)} mm",
+        )
+        custom_metric_box(
+            label="Solar Radiation",
+            value=f"{round(today['global_radiation'])} J/m²",
+            delta=f"{round(today['global_radiation']) - round(previous_day['global_radiation'])} J/m²",
+        )
+        custom_metric_box(
+            label="Wind Speed",
+            value=f"{round(today['wind_speed'] * 0.1, 1)} m/s",
+            delta=f"{round(today['wind_speed'] * 0.1, 1) - round(previous_day['wind_speed'] * 0.1, 1)} m/s",
+        )
 
 with col2:
     st.subheader("Current Pollution Levels")
@@ -69,14 +83,22 @@ with col2:
     # Display the prediction
     # st.write(f'Predicted Pollution Level: {prediction[0]:.2f}')
     with sub1:
-        pollution_box(label="O<sub>3</sub>", value=f"{round(today['O3'])} µg/m³", delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³")
+        pollution_box(
+            label="O<sub>3</sub>",
+            value=f"{round(today['O3'])} µg/m³",
+            delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
+        )
         with st.expander("Learn more about O3", expanded=False):
             st.markdown(
                 "*Ozone (O<sub>3</sub>)*: A harmful gas at ground level, contributing to respiratory issues and aggravating asthma.",
                 unsafe_allow_html=True,
            )
    with sub2:
-        pollution_box(label="NO<sub>2</sub>", value=f"{round(today['NO2'])} µg/m³", delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³")
+        pollution_box(
+            label="NO<sub>2</sub>",
+            value=f"{round(today['NO2'])} µg/m³",
+            delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
+        )
        with st.expander("Learn more about NO2", expanded=False):
            st.markdown(
                "*Nitrogen dioxide (NO<sub>2</sub>)*: A harmful gas at ground level, contributing to respiratory issues and aggravating asthma.",
data_api_calls.py DELETED
@@ -1,191 +0,0 @@
-import codecs
-import csv
-import http.client
-import os
-import re
-import sys
-import urllib.request
-from datetime import date, timedelta
-from io import StringIO
-
-import pandas as pd
-
-
-def pollution_data():
-    particles = ["NO2", "O3"]
-    stations = ["NL10636", "NL10639", "NL10643"]
-    all_dataframes = []
-    today = date.today().isoformat() + "T09:00:00Z"
-    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
-    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
-    days_today = 0
-    days_yesterday = 1
-    while(today != latest_date):
-        days_today += 1
-        days_yesterday += 1
-        for particle in particles:
-            for station in stations:
-                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
-                payload = ''
-                headers = {}
-                conn.request("GET", f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}", payload, headers)
-                res = conn.getresponse()
-                data = res.read()
-                decoded_data = data.decode("utf-8")
-                df = pd.read_csv(StringIO(decoded_data))
-                df = df.filter(like='value')
-                all_dataframes.append(df)
-            combined_data = pd.concat(all_dataframes, ignore_index=True)
-            combined_data.to_csv(f'{particle}_{today}.csv', index=False)
-        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
-        yesterday = (date.today() - timedelta(days_yesterday)).isoformat() + "T09:00:00Z"
-
-def delete_csv(csvs):
-    for csv in csvs:
-        if(os.path.exists(csv) and os.path.isfile(csv)):
-            os.remove(csv)
-
-def clean_values():
-    particles = ["NO2", "O3"]
-    csvs = []
-    NO2 = []
-    O3 = []
-    today = date.today().isoformat() + "T09:00:00Z"
-    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
-    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
-    days_today = 0
-    while(today != latest_date):
-        for particle in particles:
-            name = f'{particle}_{today}.csv'
-            csvs.append(name)
-        days_today += 1
-        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
-    for csv_file in csvs:
-        values = []  # Reset values for each CSV file
-        # Open the CSV file and read the values
-        with open(csv_file, 'r') as file:
-            reader = csv.reader(file)
-            for row in reader:
-                for value in row:
-                    # Use regular expressions to extract numeric part
-                    cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
-                    if cleaned_value:  # If we successfully extract a number
-                        values.append(float(cleaned_value[0]))  # Convert the first match to float
-
-        # Compute the average if the values list is not empty
-        if values:
-            avg = sum(values) / len(values)
-            if "NO2" in csv_file:
-                NO2.append(avg)
-            else:
-                O3.append(avg)
-
-    delete_csv(csvs)
-
-    return NO2, O3
-
-
-def add_columns():
-    file_path = 'weather_data.csv'
-    df = pd.read_csv(file_path)
-
-    df.insert(1, 'NO2', None)
-    df.insert(2, 'O3', None)
-    df.insert(10, 'weekday', None)
-
-    return df
-
-
-def scale(data):
-    df = data
-    columns = list(df.columns)
-
-    columns.insert(3, columns.pop(6))
-
-    df = df[columns]
-
-    columns.insert(5, columns.pop(9))
-
-    df = df[columns]
-
-    columns.insert(9, columns.pop(6))
-
-    df = df[columns]
-
-    df = df.rename(columns={
-        'datetime':'date',
-        'windspeed': 'wind_speed',
-        'temp': 'mean_temp',
-        'solarradiation':'global_radiation',
-        'precip':'percipitation',
-        'sealevelpressure':'pressure',
-        'visibility':'minimum_visibility'
-    })
-
-    df['date'] = pd.to_datetime(df['date'])
-    df['weekday'] = df['date'].dt.day_name()
-
-    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
-    df['mean_temp'] = df['mean_temp'] * 10
-    df['minimum_visibility'] = df['minimum_visibility'] * 10
-    df['percipitation'] = df['percipitation'] * 10
-    df['pressure'] = df['pressure'] * 10
-
-    df['wind_speed'] = df['wind_speed'].astype(int)
-    df['mean_temp'] = df['mean_temp'].astype(int)
-    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
-    df['percipitation'] = df['percipitation'].astype(int)
-    df['pressure'] = df['pressure'].astype(int)
-    df['humidity'] = df['humidity'].astype(int)
-    df['global_radiation'] = df['global_radiation'].astype(int)
-
-    return df
-
-def insert_pollution(NO2, O3, data):
-    df = data
-    start_index = 0
-    while NO2:
-        df.loc[start_index, 'NO2'] = NO2.pop()
-        start_index += 1
-    start_index = 0
-    while O3:
-        df.loc[start_index, 'O3'] = O3.pop()
-        start_index += 1
-    return df
-
-def weather_data():
-    today = date.today().isoformat()
-    seven_days = (date.today() - timedelta(7)).isoformat()
-    try:
-        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{seven_days}/{today}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
-
-        # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
-        # Saving the CSV content to a file
-        current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, 'weather_data.csv')
-        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerows(CSVText)
-
-    except urllib.error.HTTPError as e:
-        ErrorInfo = e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
-        sys.exit()
-    except urllib.error.URLError as e:
-        ErrorInfo = e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
-        sys.exit()
-
-
-def get_data():
-    weather_data()
-    pollution_data()
-    NO2, O3 = clean_values()
-    df = add_columns()
-    scaled_df = scale(df)
-    output_df = insert_pollution(NO2, O3, scaled_df)
-    os.remove('weather_data.csv')
-    return output_df
pages/admin.py CHANGED
@@ -1,8 +1,63 @@
-import streamlit as st
-import pandas as pd
 import numpy as np
-from sklearn.linear_model import LinearRegression
-import joblib
+import pandas as pd
+import streamlit as st
+
+USERNAME = "admin"
+PASSWORD = "password"
+
+st.title("Admin Panel")
 
-# Title
-st.title("Admin Panel")
+# Login Form
+login_success = False
+with st.form("login_form"):
+    st.write("Please login to access the admin dashboard:")
+    username = st.text_input("Username")
+    password = st.text_input("Password", type="password")
+    login_button = st.form_submit_button("Login")
+
+if login_button:
+    if username == USERNAME and password == PASSWORD:
+        login_success = True
+        st.success("Login successful!")
+    else:
+        st.error("Invalid username or password.")
+
+# After successful login
+if login_success:
+    # Display information about model performance
+    st.header("Model Performance Metrics")
+
+    model_r2_score = 0.85  # Mock R^2 Score
+    avg_prediction_time = 0.15  # Mock Average Prediction Time in seconds
+    num_predictions_made = 2000  # Mock Number of Predictions Made
+
+    st.metric(label="R² Score", value=f"{model_r2_score:.2f}")
+    st.metric(
+        label="Average Prediction Time", value=f"{avg_prediction_time:.2f} seconds"
+    )
+    st.metric(label="Total Predictions Made", value=num_predictions_made)
+
+    st.subheader("Detailed Metrics")
+    detailed_metrics = pd.DataFrame(
+        {
+            "Metric": ["MAE", "MSE", "RMSE", "Training Time"],
+            "Value": [2.5, 3.4, 1.8, "1.2 hours"],
+        }
+    )
+    st.table(detailed_metrics)
+
+    # Mocking prediction latency over time (example chart)
+    st.subheader("Prediction Latency Over Time")
+    latency_data = pd.DataFrame(
+        {
+            "Date": pd.date_range(end=pd.Timestamp.today(), periods=7).to_list(),
+            "Prediction Time (s)": np.random.uniform(0.1, 0.5, 7),
+        }
+    )
+    st.line_chart(latency_data.set_index("Date"))
+
+    # Button to simulate refreshing metrics
+    if st.button("Refresh Metrics"):
+        st.experimental_rerun()
+else:
+    st.warning("Please login to access the admin panel.")
pollution_data.csv ADDED
@@ -0,0 +1,9 @@
+date,NO2,O3
+2024-10-17,22.804605103280675,22.769159859976643
+2024-10-18,23.2685,23.30733245729302
+2024-10-19,23.91006441223834,23.1717142857143
+2024-10-20,22.573237547892735,23.53784452296821
+2024-10-21,21.1457004830918,24.020695652173934
+2024-10-22,21.776579804560274,23.33588571428572
+2024-10-23,21.974793814433,22.21468879668051
+2024-10-24,25.51256756756757,20.91370967741937
python.py DELETED
@@ -1,3 +0,0 @@
-from data_api_calls import get_data
-
-get_data()
src/data_api_calls.py ADDED
@@ -0,0 +1,181 @@
+import codecs
+import csv
+import http.client
+import os
+import re
+import sys
+import urllib.request
+from datetime import date, timedelta
+from io import StringIO
+
+import pandas as pd
+
+WEATHER_DATA_FILE = "weather_data.csv"
+POLLUTION_DATA_FILE = "pollution_data.csv"
+
+
+def update_weather_data():
+    today = date.today().isoformat()
+
+    if os.path.exists(WEATHER_DATA_FILE):
+        df = pd.read_csv(WEATHER_DATA_FILE)
+        last_date = pd.to_datetime(df["date"]).max()
+        start_date = (last_date + timedelta(1)).isoformat()
+    else:
+        df = pd.DataFrame()
+        start_date = (date.today() - timedelta(7)).isoformat()
+
+    try:
+        ResultBytes = urllib.request.urlopen(
+            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{today}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
+        )
+        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
+
+        new_data = pd.DataFrame(list(CSVText))
+        new_data.columns = new_data.iloc[0]
+        new_data = new_data[1:]
+        new_data = new_data.rename(columns={"datetime": "date"})
+
+        updated_df = pd.concat([df, new_data], ignore_index=True)
+        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
+        updated_df.to_csv(WEATHER_DATA_FILE, index=False)
+
+    except urllib.error.HTTPError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+    except urllib.error.URLError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+
+
+def update_pollution_data():
+    O3 = []
+    NO2 = []
+    particles = ["NO2", "O3"]
+    stations = ["NL10636", "NL10639", "NL10643"]
+    all_dataframes = []
+    today = date.today().isoformat() + "T09:00:00Z"
+    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
+    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
+    days_today = 0
+    days_yesterday = 1
+    while today != latest_date:
+        days_today += 1
+        days_yesterday += 1
+        for particle in particles:
+            for station in stations:
+                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
+                payload = ""
+                headers = {}
+                conn.request(
+                    "GET",
+                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                    payload,
+                    headers,
+                )
+                res = conn.getresponse()
+                data = res.read()
+                decoded_data = data.decode("utf-8")
+                df = pd.read_csv(StringIO(decoded_data))
+                df = df.filter(like="value")
+                all_dataframes.append(df)
+            combined_data = pd.concat(all_dataframes, ignore_index=True)
+            values = []
+            for row in combined_data:
+                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
+                if cleaned_value:  # If we successfully extract a number
+                    values.append(
+                        float(cleaned_value[0])
+                    )  # Convert the first match to float
+
+            # Compute the average if the values list is not empty
+            if values:
+                avg = sum(values) / len(values)
+                if particle == "NO2":
+                    NO2.append(avg)
+                else:
+                    O3.append(avg)
+        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
+        yesterday = (
+            date.today() - timedelta(days_yesterday)
+        ).isoformat() + "T09:00:00Z"
+
+    avg_combined_data = pd.DataFrame(
+        {
+            "date": pd.date_range(end=date.today(), periods=len(NO2)),
+            "NO2": NO2,
+            "O3": O3,
+        }
+    )
+
+    avg_combined_data = reverse_pollution(NO2, O3, avg_combined_data)
+
+    if os.path.exists(POLLUTION_DATA_FILE):
+        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
+        last_date = pd.to_datetime(existing_data["date"]).max()
+        new_data = avg_combined_data[avg_combined_data["date"] > last_date]
+        updated_data = pd.concat([existing_data, new_data], ignore_index=True)
+        updated_data.drop_duplicates(subset="date", keep="last", inplace=True)
+    else:
+        updated_data = avg_combined_data
+
+    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
+
+
+def reverse_pollution(NO2, O3, data):
+    df = data
+    start_index = 0
+    while NO2:
+        df.loc[start_index, "NO2"] = NO2.pop()
+        start_index += 1
+    start_index = 0
+    while O3:
+        df.loc[start_index, "O3"] = O3.pop()
+        start_index += 1
+    return df
+
+
+def get_combined_data():
+    update_weather_data()
+    update_pollution_data()
+
+    weather_df = pd.read_csv(WEATHER_DATA_FILE)
+    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
+
+    # Average NO2 and O3 values by date and add them to weather data
+    combined_df = pd.merge(weather_df, pollution_df, on="date", how="left")
+    combined_df.fillna(0, inplace=True)
+
+    # Apply scaling and renaming similar to the scale function from previous code
+    combined_df = combined_df.rename(
+        columns={
+            "date": "date",
+            "windspeed": "wind_speed",
+            "temp": "mean_temp",
+            "solarradiation": "global_radiation",
+            "precip": "percipitation",
+            "sealevelpressure": "pressure",
+            "visibility": "minimum_visibility",
+        }
+    )
+
+    combined_df["date"] = pd.to_datetime(combined_df["date"])
+    combined_df["weekday"] = combined_df["date"].dt.day_name()
+
+    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
+    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
+    combined_df["percipitation"] = combined_df["percipitation"] * 10
+    combined_df["pressure"] = combined_df["pressure"] * 10
+
+    combined_df["wind_speed"] = combined_df["wind_speed"].astype(int)
+    combined_df["mean_temp"] = combined_df["mean_temp"].astype(int)
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(int)
+    combined_df["percipitation"] = combined_df["percipitation"].astype(int)
+    combined_df["pressure"] = combined_df["pressure"].astype(int)
+    combined_df["humidity"] = combined_df["humidity"].astype(int)
+    combined_df["global_radiation"] = combined_df["global_radiation"].astype(int)
+
+    return combined_df
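
(For orientation, a minimal sketch of how this new module is driven; the import and function name come from the file above, while actually running it requires the Visual Crossing and Luchtmeetnet endpoints to be reachable, and the column list is what get_combined_data yields after its renaming step.)

from src.data_api_calls import get_combined_data

week_data = get_combined_data()
# Expected columns: date, NO2, O3, wind_speed, mean_temp, global_radiation,
# percipitation, pressure, minimum_visibility, humidity, weekday
print(week_data.tail(3))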
src/{data_loading.py → features_pipeline.py} RENAMED
@@ -1,8 +1,12 @@
+import warnings
+
 import joblib
 import numpy as np
 import pandas as pd
 
-from past_data_api_calls import get_past_data
+from src.past_data_api_calls import get_past_combined_data
+
+warnings.filterwarnings("ignore")
 
 
 def create_features(
@@ -11,37 +15,6 @@ def create_features(
     lag_days=7,
     sma_days=7,
 ):
-    """
-    Creates lagged features, SMA features, last year's particle data (NO2 and O3) for specific days,
-    sine and cosine transformations for 'weekday' and 'month', and target variables for the specified
-    particle ('O3' or 'NO2') for the next 'days_ahead' days. Scales features and targets without
-    disregarding outliers and saves the scalers for inverse scaling. Splits the data into train,
-    validation, and test sets using the most recent dates. Prints the number of rows with missing
-    values dropped from the dataset.
-
-    Parameters:
-    - data (pd.DataFrame): The input time-series dataset.
-    - target_particle (str): The target particle ('O3' or 'NO2') for which targets are created.
-    - lag_days (int): Number of lag days to create features for (default 7).
-    - sma_days (int): Window size for Simple Moving Average (default 7).
-    - days_ahead (int): Number of days ahead to create target variables for (default 3).
-
-    Returns:
-    - X_train_scaled (pd.DataFrame): Scaled training features.
-    - y_train_scaled (pd.DataFrame): Scaled training targets.
-    - X_val_scaled (pd.DataFrame): Scaled validation features (365 days).
-    - y_val_scaled (pd.DataFrame): Scaled validation targets (365 days).
-    - X_test_scaled (pd.DataFrame): Scaled test features (365 days).
-    - y_test_scaled (pd.DataFrame): Scaled test targets (365 days).
-    """
-    import warnings
-
-    import numpy as np
-    import pandas as pd
-    from sklearn.preprocessing import StandardScaler
-
-    warnings.filterwarnings("ignore")
-
     lag_features = [
         "NO2",
         "O3",
@@ -70,9 +43,7 @@ def create_features(
     # Create sine and cosine transformations for 'weekday' and 'month'
     data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
     data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
-    data["month_sin"] = np.sin(
-        2 * np.pi * (data["month"] - 1) / 12
-    )  # Adjust month to 0-11
+    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
     data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)
 
     # Create lagged features for the specified lag days
@@ -86,32 +57,26 @@ def create_features(
             data[feature].rolling(window=sma_days).mean()
         )
 
-    past_data = get_past_data()
     # Create particle data (NO2 and O3) from the same time last year
+    past_data = get_past_combined_data()
 
     # Today last year
-    data["O3_last_year"] = past_data["O3"].iloc[-4]  # data["O3_last_year"] = data["O3"].shift(365)
-    data["NO2_last_year"] = past_data["NO2"].iloc[-4]  # data["NO2_last_year"] = data["NO2"].shift(365)
+    data["O3_last_year"] = past_data["O3"].iloc[-4]
+    data["NO2_last_year"] = past_data["NO2"].iloc[-4]
 
     # 7 days before today last year
-    for i in range(1, lag_days+1):
-        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i-1]  # data["O3"].shift(365 + i)
-        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i-1]  # data["NO2"].shift(365 + i)
+    for i in range(1, lag_days + 1):
+        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i - 1]
+        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i - 1]
 
     # 3 days after today last year
-    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]  # data["O3"].shift(365 - 3)
-    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]  # data["NO2"].shift(365 - 3)
-
-    # Calculate the number of rows before dropping missing values
-    rows_before = data.shape[0]
+    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]
+    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]
 
     # Drop missing values
+    rows_before = data.shape[0]
     data = data.dropna().reset_index(drop=True)
-
-    # Calculate the number of rows after dropping missing values
     rows_after = data.shape[0]
-
-    # Calculate and print the number of rows dropped
     rows_dropped = rows_before - rows_after
     print(f"Number of rows with missing values dropped: {rows_dropped}")
@@ -125,16 +90,11 @@ def create_features(
     # Split features and targets
     x = data[feature_cols]
 
-
-    # Initialize scalers
+    # Scale
     feature_scaler = joblib.load(f"scalers/feature_scaler_{target_particle}.joblib")
-
-    # Fit the scalers on the training data
     X_scaled = feature_scaler.transform(x)
 
     # Convert scaled data back to DataFrame for consistency
-    X_scaled = pd.DataFrame(
-        X_scaled, columns=feature_cols, index=x.index
-    )
+    X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=x.index)
 
     return X_scaled
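
(A self-contained illustration of the weekday/month cyclical encoding that create_features applies; the toy values below are made up, only the formulas match the diff above.)

import numpy as np
import pandas as pd

toy = pd.DataFrame({"weekday": [0, 3, 6], "month": [1, 6, 12]})
toy["weekday_sin"] = np.sin(2 * np.pi * toy["weekday"] / 7)
toy["weekday_cos"] = np.cos(2 * np.pi * toy["weekday"] / 7)
toy["month_sin"] = np.sin(2 * np.pi * (toy["month"] - 1) / 12)  # month shifted to 0-11
toy["month_cos"] = np.cos(2 * np.pi * (toy["month"] - 1) / 12)
print(toy.round(3))  # adjacent days/months map to nearby points on the unit circle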
past_data_api_calls.py → src/past_data_api_calls copy.py RENAMED
@@ -17,7 +17,9 @@ def pollution_data():
     last_year_date = date.today() - timedelta(days=365)
     start_date = last_year_date - timedelta(days=7)
     end_date = last_year_date + timedelta(days=3)
-    date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
+    date_list = [
+        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
+    ]
     for current_date in date_list:
         today = current_date.isoformat() + "T09:00:00Z"
         yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
@@ -25,24 +27,31 @@ def pollution_data():
             all_dataframes = []  # Reset for each particle
             for station in stations:
                 conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
-                payload = ''
+                payload = ""
                 headers = {}
-                conn.request("GET", f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}", payload, headers)
+                conn.request(
+                    "GET",
+                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                    payload,
+                    headers,
+                )
                 res = conn.getresponse()
                 data = res.read()
                 decoded_data = data.decode("utf-8")
                 df = pd.read_csv(StringIO(decoded_data))
-                df = df.filter(like='value')
+                df = df.filter(like="value")
                 all_dataframes.append(df)
             if all_dataframes:
                 combined_data = pd.concat(all_dataframes, ignore_index=True)
-                combined_data.to_csv(f'{particle}_{today}.csv', index=False)
+                combined_data.to_csv(f"{particle}_{today}.csv", index=False)
+
 
 def delete_csv(csvs):
     for csv_file in csvs:
-        if(os.path.exists(csv_file) and os.path.isfile(csv_file)):
+        if os.path.exists(csv_file) and os.path.isfile(csv_file):
             os.remove(csv_file)
 
+
 def clean_values():
     particles = ["NO2", "O3"]
     csvs = []
@@ -51,25 +60,29 @@ def clean_values():
     last_year_date = date.today() - timedelta(days=365)
     start_date = last_year_date - timedelta(days=7)
     end_date = last_year_date + timedelta(days=3)
-    date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
+    date_list = [
+        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
+    ]
     for current_date in date_list:
         today = current_date.isoformat() + "T09:00:00Z"
         for particle in particles:
-            name = f'{particle}_{today}.csv'
+            name = f"{particle}_{today}.csv"
             csvs.append(name)
     for csv_file in csvs:
         if not os.path.exists(csv_file):
             continue  # Skip if the file doesn't exist
         values = []  # Reset values for each CSV file
         # Open the CSV file and read the values
-        with open(csv_file, 'r') as file:
+        with open(csv_file, "r") as file:
             reader = csv.reader(file)
             for row in reader:
                 for value in row:
                     # Use regular expressions to extract numeric part
                     cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
                     if cleaned_value:  # If we successfully extract a number
-                        values.append(float(cleaned_value[0]))  # Convert the first match to float
+                        values.append(
+                            float(cleaned_value[0])
+                        )  # Convert the first match to float
 
         # Compute the average if the values list is not empty
         if values:
@@ -81,16 +94,18 @@ def clean_values():
     delete_csv(csvs)
     return NO2, O3
 
+
 def add_columns():
-    file_path = 'weather_data.csv'
+    file_path = "weather_data.csv"
     df = pd.read_csv(file_path)
 
-    df.insert(1, 'NO2', None)
-    df.insert(2, 'O3', None)
-    df.insert(10, 'weekday', None)
+    df.insert(1, "NO2", None)
+    df.insert(2, "O3", None)
+    df.insert(10, "weekday", None)
 
     return df
 
+
 def scale(data):
     df = data
     columns = list(df.columns)
@@ -104,97 +119,72 @@ def scale(data):
     columns.insert(9, columns.pop(6))
     df = df[columns]
 
-    df = df.rename(columns={
-        'datetime':'date',
-        'windspeed': 'wind_speed',
-        'temp': 'mean_temp',
-        'solarradiation':'global_radiation',
-        'precip':'percipitation',
-        'sealevelpressure':'pressure',
-        'visibility':'minimum_visibility'
-    })
+    df = df.rename(
+        columns={
+            "datetime": "date",
+            "windspeed": "wind_speed",
+            "temp": "mean_temp",
+            "solarradiation": "global_radiation",
+            "precip": "percipitation",
+            "sealevelpressure": "pressure",
+            "visibility": "minimum_visibility",
+        }
+    )
 
-    df['date'] = pd.to_datetime(df['date'])
-    df['weekday'] = df['date'].dt.day_name()
+    df["date"] = pd.to_datetime(df["date"])
+    df["weekday"] = df["date"].dt.day_name()
 
-    df = df.sort_values(by='date').reset_index(drop=True)
+    df = df.sort_values(by="date").reset_index(drop=True)
 
-    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
-    df['mean_temp'] = df['mean_temp'] * 10
-    df['minimum_visibility'] = df['minimum_visibility'] * 10
-    df['percipitation'] = df['percipitation'] * 10
-    df['pressure'] = df['pressure']
+    df["wind_speed"] = (df["wind_speed"] / 3.6) * 10
+    df["mean_temp"] = df["mean_temp"] * 10
+    df["minimum_visibility"] = df["minimum_visibility"] * 10
+    df["percipitation"] = df["percipitation"] * 10
+    df["pressure"] = df["pressure"]
 
-    df['wind_speed'] = df['wind_speed'].astype(int)
-    df['mean_temp'] = df['mean_temp'].astype(int)
-    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
-    df['percipitation'] = df['percipitation'].astype(int)
-    df['pressure'] = df['pressure'].astype(int)
-    df['humidity'] = df['humidity'].astype(int)
-    df['global_radiation'] = df['global_radiation'].astype(int)
+    df["wind_speed"] = df["wind_speed"].astype(int)
+    df["mean_temp"] = df["mean_temp"].astype(int)
+    df["minimum_visibility"] = df["minimum_visibility"].astype(int)
+    df["percipitation"] = df["percipitation"].astype(int)
+    df["pressure"] = df["pressure"].astype(int)
+    df["humidity"] = df["humidity"].astype(int)
+    df["global_radiation"] = df["global_radiation"].astype(int)
 
     return df
 
+
 def insert_pollution(NO2, O3, data):
     df = data
-    df['NO2'] = NO2
-    df['O3'] = O3
+    df["NO2"] = NO2
+    df["O3"] = O3
     return df
 
+
 def weather_data():
-    # Get last year's same day
     last_year_date = date.today() - timedelta(days=365)
-    # Start date is 7 days prior
     start_date = (last_year_date - timedelta(days=7)).isoformat()
-    # End date is 3 days ahead
     end_date = (last_year_date + timedelta(days=3)).isoformat()
-    try:
-        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
-
-        # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
-        # Saving the CSV content to a file
-        current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, 'weather_data.csv')
-        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerows(CSVText)
-
-    except urllib.error.HTTPError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
-        sys.exit()
-    except urllib.error.URLError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code,ErrorInfo)
-        sys.exit()
-
-def weather_data():
-    # Set up dates for last year: 7 days before today last year, and 3 days ahead of this day last year
-    today_last_year = date.today() - timedelta(365)
-    start_last_year = today_last_year - timedelta(8)
-    end_last_year = today_last_year + timedelta(2)
-
-    try:
-        # API call with new date range for last year
-        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_last_year}/{end_last_year}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
-
+    try:
+        ResultBytes = urllib.request.urlopen(
+            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
+        )
+
         # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
+        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
         # Saving the CSV content to a file
         current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, 'weather_data.csv')
-        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
+        file_path = os.path.join(current_dir, "past_weather_data.csv")
+        with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
             csv_writer = csv.writer(csvfile)
             csv_writer.writerows(CSVText)
 
     except urllib.error.HTTPError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code, ErrorInfo)
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
         sys.exit()
     except urllib.error.URLError as e:
-        ErrorInfo= e.read().decode()
-        print('Error code: ', e.code,ErrorInfo)
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
        sys.exit()
@@ -205,5 +195,5 @@ def get_past_data():
     df = add_columns()
     scaled_df = scale(df)
     output_df = insert_pollution(NO2, O3, scaled_df)
-    os.remove('weather_data.csv')
-    return output_df
+    os.remove("past_weather_data.csv")
+    return output_df
src/past_data_api_calls.py ADDED
@@ -0,0 +1,140 @@
+import codecs
+import csv
+import http.client
+import re
+import sys
+import urllib.request
+from datetime import date, timedelta
+from io import StringIO
+
+import pandas as pd
+
+PAST_WEATHER_DATA_FILE = "weather_data.csv"
+PAST_POLLUTION_DATA_FILE = "pollution_data.csv"
+
+
+def get_past_weather_data():
+    last_year_date = date.today() - timedelta(days=365)
+    start_date = (last_year_date - timedelta(days=7)).isoformat()
+    end_date = (last_year_date + timedelta(days=3)).isoformat()
+
+    try:
+        ResultBytes = urllib.request.urlopen(
+            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
+        )
+        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
+
+        data = pd.DataFrame(list(CSVText))
+        data.columns = data.iloc[0]
+        data = data[1:]
+        data = data.rename(columns={"datetime": "date"})
+        return data
+
+    except urllib.error.HTTPError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+    except urllib.error.URLError as e:
+        ErrorInfo = e.read().decode()
+        print("Error code: ", e.code, ErrorInfo)
+        sys.exit()
+
+
+def get_past_pollution_data():
+    O3 = []
+    NO2 = []
+    particles = ["NO2", "O3"]
+    stations = ["NL10636", "NL10639", "NL10643"]
+    all_dataframes = []
+    last_year_date = date.today() - timedelta(days=365)
+    start_date = last_year_date - timedelta(days=7)
+    end_date = last_year_date + timedelta(days=3)
+    date_list = [
+        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
+    ]
+    for current_date in date_list:
+        today = current_date.isoformat() + "T09:00:00Z"
+        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
+        for particle in particles:
+            all_dataframes = []  # Reset for each particle
+            for station in stations:
+                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
+                payload = ""
+                headers = {}
+                conn.request(
+                    "GET",
+                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                    payload,
+                    headers,
+                )
+                res = conn.getresponse()
+                data = res.read()
+                decoded_data = data.decode("utf-8")
+                df = pd.read_csv(StringIO(decoded_data))
+                df = df.filter(like="value")
+                all_dataframes.append(df)
+
+            combined_data = pd.concat(all_dataframes, ignore_index=True)
+            values = []
+            for row in combined_data:
+                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
+                if cleaned_value:
+                    values.append(float(cleaned_value[0]))
+
+            if values:
+                avg = sum(values) / len(values)
+                if particle == "NO2":
+                    NO2.append(avg)
+                else:
+                    O3.append(avg)
+
+    return NO2, O3
+
+
+def get_past_combined_data():
+    weather_df = get_past_weather_data()
+    NO2_df, O3_df = get_past_pollution_data()
+
+    combined_df = weather_df
+    combined_df["NO2"] = NO2_df
+    combined_df["O3"] = O3_df
+
+    # Apply scaling and renaming similar to the scale function from previous code
+    combined_df = combined_df.rename(
+        columns={
+            "date": "date",
+            "windspeed": "wind_speed",
+            "temp": "mean_temp",
+            "solarradiation": "global_radiation",
+            "precip": "percipitation",
+            "sealevelpressure": "pressure",
+            "visibility": "minimum_visibility",
+        }
+    )
+
+    combined_df["date"] = pd.to_datetime(combined_df["date"])
+    combined_df["weekday"] = combined_df["date"].dt.day_name()
+
+    combined_df["wind_speed"] = combined_df["wind_speed"].astype(float)
+    combined_df["mean_temp"] = combined_df["mean_temp"].astype(float)
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float)
+    combined_df["percipitation"] = combined_df["percipitation"].astype(float)
+    combined_df["pressure"] = combined_df["pressure"].astype(float).round()
+    combined_df["humidity"] = combined_df["humidity"].astype(float).round()
+    combined_df["global_radiation"] = combined_df["global_radiation"].astype(float)
+
+    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
+    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
+    combined_df["percipitation"] = combined_df["percipitation"] * 10
+    combined_df["pressure"] = combined_df["pressure"] * 10
+
+    combined_df["wind_speed"] = combined_df["wind_speed"].astype(float).round().astype(int)
+    combined_df["mean_temp"] = combined_df["mean_temp"].astype(float).round().astype(int)
+    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float).round().astype(int)
+    combined_df["percipitation"] = combined_df["percipitation"].astype(float).round().astype(int)
+    combined_df["pressure"] = combined_df["pressure"].astype(float).round().astype(int)
+    combined_df["humidity"] = combined_df["humidity"].astype(float).round().astype(int)
+    combined_df["global_radiation"] = combined_df["global_radiation"].astype(float).round().astype(int)
+
+    return combined_df
src/{models_loading.py → predict.py} RENAMED
@@ -1,12 +1,15 @@
 import os
 
 import joblib
-import pandas as pd
 import streamlit as st
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
-from src.data_loading import create_features
 
+from src.data_api_calls import get_combined_data
+from src.features_pipeline import create_features
+
+
+@st.cache_resource()
 def load_model(particle):
     load_dotenv()
     login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
@@ -15,21 +18,30 @@ def load_model(particle):
     if particle == "O3":
         file_name = "O3_svr_model.pkl"
     elif particle == "NO2":
-        file_name == "hehehe"
+        file_name = "NO2_nn_model.pkl"
 
     model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
     model = joblib.load(model_path)
-
     return model
 
 
-@st.cache_resource(ttl=6 * 300)  # Reruns every 6 hours
 def run_model(particle, data):
     input_data = create_features(data=data, target_particle=particle)
     model = load_model(particle)
-
-    # Run the model with static input
     prediction = model.predict(input_data)
     target_scaler = joblib.load(f"scalers/target_scaler_{particle}.joblib")
     prediction = target_scaler.inverse_transform(prediction)
     return prediction
+
+
+def get_data_and_predictions():
+    PREDICTIONS_FILE = "predictions_history.csv"
+
+    week_data = get_combined_data()
+
+    o3_input_features = create_features(week_data, "O3")
+    no2_input_features = create_features(week_data, "NO2")
+    o3_predictions = run_model("O3", data=o3_input_features)
+    no2_predictions = run_model("NO2", data=no2_input_features)
+
+    return week_data, o3_predictions, no2_predictions
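
(A minimal sketch of the new entry point as app.py consumes it; the result shapes are inferred from how app.py indexes them, and error handling is omitted.)

from src.predict import get_data_and_predictions

week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
# week_data: one row per day for the past week (weather plus NO2/O3)
# predictions_*: arrays whose first row holds the 3-day-ahead forecast,
# e.g. predictions_O3[0].flatten() -> three O3 values in µg/m³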
test.ipynb CHANGED
@@ -9,34 +9,63 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "/Users/mihkelmariuszjezierski/anaconda3/envs/ml-industry/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
-   "from src.models_loading import run_model\n",
-   "from data_api_calls import get_data\n",
-   "import pandas as pd\n",
-   "from past_data_api_calls import get_past_data"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
-   "from src.data_loading import create_features"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
-   "df = get_data()"
   ]
  },
  {
@@ -672,7 +701,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "ml-industry",
    "language": "python",
    "name": "python3"
   },
@@ -686,7 +715,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 
     "name": "stderr",
     "output_type": "stream",
     "text": [
+     "c:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
+   "from src.data_api_calls import get_combined_data\n",
+   "from src.past_data_api_calls import get_past_combined_data\n",
+   "from src.predict import get_data_and_predictions"
   ]
  },
  {
   "cell_type": "code",
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+   "get_past_combined_data()"
   ]
  },
  {
   "cell_type": "code",
+  "execution_count": 2,
+  "metadata": {},
+  "outputs": [
+   {
+    "ename": "OSError",
+    "evalue": "[Errno 22] Invalid argument: 'NO2_2023-10-18T09:00:00Z.csv'",
+    "output_type": "error",
+    "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m week_data, predictions_O3, predictions_NO2 \u001b[38;5;241m=\u001b[39m \u001b[43mget_data_and_predictions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\predict.py:42\u001b[0m, in \u001b[0;36mget_data_and_predictions\u001b[1;34m()\u001b[0m\n\u001b[0;32m 38\u001b[0m PREDICTIONS_FILE \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpredictions_history.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 40\u001b[0m week_data \u001b[38;5;241m=\u001b[39m get_combined_data()\n\u001b[1;32m---> 42\u001b[0m o3_input_features \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweek_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mO3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 43\u001b[0m no2_input_features \u001b[38;5;241m=\u001b[39m create_features(week_data, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNO2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 44\u001b[0m o3_predictions \u001b[38;5;241m=\u001b[39m run_model(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3\u001b[39m\u001b[38;5;124m\"\u001b[39m, data\u001b[38;5;241m=\u001b[39mo3_input_features)\n",
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\features_pipeline.py:61\u001b[0m, in \u001b[0;36mcreate_features\u001b[1;34m(data, target_particle, lag_days, sma_days)\u001b[0m\n\u001b[0;32m 56\u001b[0m data[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfeature\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_sma_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msma_days\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 57\u001b[0m data[feature]\u001b[38;5;241m.\u001b[39mrolling(window\u001b[38;5;241m=\u001b[39msma_days)\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m 58\u001b[0m )\n\u001b[0;32m 60\u001b[0m \u001b[38;5;66;03m# Create particle data (NO2 and O3) from the same time last year\u001b[39;00m\n\u001b[1;32m---> 61\u001b[0m past_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_past_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 63\u001b[0m \u001b[38;5;66;03m# Today last year\u001b[39;00m\n\u001b[0;32m 64\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3_last_year\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m past_data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m4\u001b[39m]\n",
47
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\past_data_api_calls.py:193\u001b[0m, in \u001b[0;36mget_past_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_past_data\u001b[39m():\n\u001b[0;32m 192\u001b[0m weather_data()\n\u001b[1;32m--> 193\u001b[0m \u001b[43mpollution_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 194\u001b[0m NO2, O3 \u001b[38;5;241m=\u001b[39m clean_values()\n\u001b[0;32m 195\u001b[0m df \u001b[38;5;241m=\u001b[39m add_columns()\n",
48
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\past_data_api_calls.py:46\u001b[0m, in \u001b[0;36mpollution_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 44\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m all_dataframes:\n\u001b[0;32m 45\u001b[0m combined_data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(all_dataframes, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m---> 46\u001b[0m \u001b[43mcombined_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mparticle\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mtoday\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
49
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\util\\_decorators.py:333\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 327\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 328\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 329\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 330\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 331\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 332\u001b[0m )\n\u001b[1;32m--> 333\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
50
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:3967\u001b[0m, in \u001b[0;36mNDFrame.to_csv\u001b[1;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, lineterminator, chunksize, date_format, doublequote, escapechar, decimal, errors, storage_options)\u001b[0m\n\u001b[0;32m 3956\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m, ABCDataFrame) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mto_frame()\n\u001b[0;32m 3958\u001b[0m formatter \u001b[38;5;241m=\u001b[39m DataFrameFormatter(\n\u001b[0;32m 3959\u001b[0m frame\u001b[38;5;241m=\u001b[39mdf,\n\u001b[0;32m 3960\u001b[0m header\u001b[38;5;241m=\u001b[39mheader,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 3964\u001b[0m decimal\u001b[38;5;241m=\u001b[39mdecimal,\n\u001b[0;32m 3965\u001b[0m )\n\u001b[1;32m-> 3967\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mDataFrameRenderer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_csv\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 3968\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_buf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3969\u001b[0m \u001b[43m \u001b[49m\u001b[43mlineterminator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlineterminator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3970\u001b[0m \u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3971\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3972\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3973\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3974\u001b[0m \u001b[43m \u001b[49m\u001b[43mquoting\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquoting\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3975\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3976\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex_label\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex_label\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3977\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3978\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3979\u001b[0m \u001b[43m \u001b[49m\u001b[43mquotechar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquotechar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3980\u001b[0m \u001b[43m \u001b[49m\u001b[43mdate_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdate_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3981\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdoublequote\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdoublequote\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3982\u001b[0m \u001b[43m \u001b[49m\u001b[43mescapechar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mescapechar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3983\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3984\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
51
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\io\\formats\\format.py:1014\u001b[0m, in \u001b[0;36mDataFrameRenderer.to_csv\u001b[1;34m(self, path_or_buf, encoding, sep, columns, index_label, mode, compression, quoting, quotechar, lineterminator, chunksize, date_format, doublequote, escapechar, errors, storage_options)\u001b[0m\n\u001b[0;32m 993\u001b[0m created_buffer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 995\u001b[0m csv_formatter \u001b[38;5;241m=\u001b[39m CSVFormatter(\n\u001b[0;32m 996\u001b[0m path_or_buf\u001b[38;5;241m=\u001b[39mpath_or_buf,\n\u001b[0;32m 997\u001b[0m lineterminator\u001b[38;5;241m=\u001b[39mlineterminator,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1012\u001b[0m formatter\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfmt,\n\u001b[0;32m 1013\u001b[0m )\n\u001b[1;32m-> 1014\u001b[0m \u001b[43mcsv_formatter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1016\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m created_buffer:\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path_or_buf, StringIO)\n",
52
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\io\\formats\\csvs.py:251\u001b[0m, in \u001b[0;36mCSVFormatter.save\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 247\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 248\u001b[0m \u001b[38;5;124;03mCreate the writer & save.\u001b[39;00m\n\u001b[0;32m 249\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 250\u001b[0m \u001b[38;5;66;03m# apply compression and byte/text conversion\u001b[39;00m\n\u001b[1;32m--> 251\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 252\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 253\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 254\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 255\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 256\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompression\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 257\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 258\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m handles:\n\u001b[0;32m 259\u001b[0m \u001b[38;5;66;03m# Note: self.encoding is irrelevant here\u001b[39;00m\n\u001b[0;32m 260\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwriter \u001b[38;5;241m=\u001b[39m csvlib\u001b[38;5;241m.\u001b[39mwriter(\n\u001b[0;32m 261\u001b[0m handles\u001b[38;5;241m.\u001b[39mhandle,\n\u001b[0;32m 262\u001b[0m lineterminator\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineterminator,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 267\u001b[0m quotechar\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mquotechar,\n\u001b[0;32m 268\u001b[0m )\n\u001b[0;32m 270\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_save()\n",
53
+ "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[0;32m 874\u001b[0m handle,\n\u001b[0;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m 876\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[0;32m 877\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[0;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 879\u001b[0m )\n\u001b[0;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
54
+ "\u001b[1;31mOSError\u001b[0m: [Errno 22] Invalid argument: 'NO2_2023-10-18T09:00:00Z.csv'"
55
+ ]
56
+ }
57
+ ],
+ "source": [
+ "week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
+ "week_data"
  ]
  },
  {
 
  ],
  "metadata": {
  "kernelspec": {
+ "display_name": ".venv",
  "language": "python",
  "name": "python3"
  },
 
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
+ "version": "3.11.8"
  }
  },
  "nbformat": 4,
test.py DELETED
@@ -1,3 +0,0 @@
- from models_loading import run_model
-
-

weather_data.csv ADDED
@@ -0,0 +1,9 @@
+ date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
+ 2024-10-17,16.9,86.0,0.6,18.4,1010.0,37.1,43.0
+ 2024-10-18,15.5,97.3,3.9,7.6,1014.0,4.5,42.9
+ 2024-10-19,14.7,89.9,1.6,14.8,1014.1,22.8,43.5
+ 2024-10-20,15.5,83.8,0.5,29.5,1016.0,41.5,0.0
+ 2024-10-21,14.4,92.7,4.3,21.2,1020.6,22.0,27.8
+ 2024-10-22,11.4,92.8,4.9,19.4,1026.9,22.6,57.0
+ 2024-10-23,11.2,97.3,0.0,13.0,1032.8,6.5,12.5
+ 2024-10-24,10.4,94.0,0.0,20.5,1024.7,13.0,62.5
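
The added weather_data.csv snapshots eight days of daily weather features for
the pipeline. A minimal sketch for loading it, assuming the file sits in the
repo root; column units are not stated in the CSV, so readings such as temp
being in degrees Celsius are assumptions:

    import pandas as pd

    # Parse the date column so the rows can be sorted and joined by day.
    weather = pd.read_csv("weather_data.csv", parse_dates=["date"])
    print(weather.dtypes)
    print(weather.tail(3))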