Mihkelmj committed on
Commit 6a440fc · 1 Parent(s): 2951a30

added daily_api__pollution.py and data_loading.py to the repo; data_loading.py needs to be modified for inference

Files changed (2):
  1. daily_api__pollution.py +161 -0
  2. data_loading.py +276 -0
daily_api__pollution.py ADDED
@@ -0,0 +1,161 @@
import http.client
from datetime import date, timedelta
import pandas as pd
from io import StringIO
import os
import re
import csv


def api_call():
    """Fetch the last seven days of NO2 and O3 measurements from the Luchtmeetnet API
    and write one CSV per particle per day."""
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    today = date.today().isoformat() + "T09:00:00Z"
    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
    latest_date = (date.today() - timedelta(7)).isoformat() + "T09:00:00Z"
    days_today = 0
    days_yesterday = 1
    while today != latest_date:
        days_today += 1
        days_yesterday += 1
        for particle in particles:
            all_dataframes = []  # reset per particle and per day so each CSV only holds that day's values for this particle
            for station in stations:
                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
                payload = ''
                headers = {}
                conn.request("GET", f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}", payload, headers)
                res = conn.getresponse()
                data = res.read()
                decoded_data = data.decode("utf-8")
                # Parse the raw response as text and keep only columns whose name contains 'value'
                df = pd.read_csv(StringIO(decoded_data))
                df = df.filter(like='value')
                all_dataframes.append(df)
            combined_data = pd.concat(all_dataframes, ignore_index=True)
            combined_data.to_csv(f'{particle}_{today}.csv', index=False)
        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
        yesterday = (date.today() - timedelta(days_yesterday)).isoformat() + "T09:00:00Z"


def delete_csv(csvs):
    """Remove the temporary per-day CSV files."""
    for csv_file in csvs:
        if os.path.exists(csv_file) and os.path.isfile(csv_file):
            os.remove(csv_file)


def clean_values():
    """Read the per-day CSVs, extract the numeric parts with a regex and
    return the daily averages for NO2 and O3."""
    particles = ["NO2", "O3"]
    csvs = []
    NO2 = []
    O3 = []
    today = date.today().isoformat() + "T09:00:00Z"
    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
    latest_date = (date.today() - timedelta(7)).isoformat() + "T09:00:00Z"
    days_today = 0
    while today != latest_date:
        for particle in particles:
            name = f'{particle}_{today}.csv'
            csvs.append(name)
        days_today += 1
        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
    for csv_file in csvs:
        values = []  # Reset values for each CSV file
        # Open the CSV file and read the values
        with open(csv_file, 'r') as file:
            reader = csv.reader(file)
            for row in reader:
                for value in row:
                    # Use a regular expression to extract the numeric part
                    cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
                    if cleaned_value:  # If we successfully extract a number
                        values.append(float(cleaned_value[0]))  # Convert the first match to float

        # Compute the average if the values list is not empty
        if values:
            avg = sum(values) / len(values)
            if "NO2" in csv_file:
                NO2.append(avg)
            else:
                O3.append(avg)

    delete_csv(csvs)

    return NO2, O3


def add_columns():
    """Add empty NO2, O3 and weekday columns to the recorded weather data."""
    file_path = 'weather_data.csv'
    df = pd.read_csv(file_path)

    df.insert(1, 'NO2', None)
    df.insert(2, 'O3', None)
    df.insert(10, 'weekday', None)

    df.to_csv('combined_data.csv', index=False)


def scale():
    """Reorder and rename the weather columns and convert them to integer
    tenths-of-unit values."""
    file_path = 'combined_data.csv'
    df = pd.read_csv(file_path)
    columns = list(df.columns)

    # Reorder the columns to match the expected layout
    columns.insert(3, columns.pop(6))
    df = df[columns]
    columns.insert(5, columns.pop(9))
    df = df[columns]
    columns.insert(9, columns.pop(6))
    df = df[columns]

    df = df.rename(columns={
        'datetime': 'date',
        'windspeed': 'wind_speed',
        'temp': 'mean_temp',
        'solarradiation': 'global_radiation',
        'precip': 'percipitation',
        'sealevelpressure': 'pressure',
        'visibility': 'minimum_visibility'
    })

    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df['date'].dt.day_name()

    # Convert wind speed from km/h to m/s, then scale all measurements to tenths of a unit
    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
    df['mean_temp'] = df['mean_temp'] * 10
    df['minimum_visibility'] = df['minimum_visibility'] * 10
    df['percipitation'] = df['percipitation'] * 10
    df['pressure'] = df['pressure'] * 10

    df['wind_speed'] = df['wind_speed'].astype(int)
    df['mean_temp'] = df['mean_temp'].astype(int)
    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
    df['percipitation'] = df['percipitation'].astype(int)
    df['pressure'] = df['pressure'].astype(int)
    df['humidity'] = df['humidity'].astype(int)
    df['global_radiation'] = df['global_radiation'].astype(int)

    df.to_csv('recorded_data.csv', index=False)


def insert_pollution(NO2, O3):
    """Write the daily NO2 and O3 averages into the recorded data."""
    file_path = 'recorded_data.csv'
    df = pd.read_csv(file_path)
    start_index = 0
    while NO2:
        df.loc[start_index, 'NO2'] = NO2.pop()
        start_index += 1
    start_index = 0
    while O3:
        df.loc[start_index, 'O3'] = O3.pop()
        start_index += 1
    df.to_csv('recorded_data.csv', index=False)


api_call()
NO2, O3 = clean_values()
add_columns()
scale()
insert_pollution(NO2, O3)
os.remove('combined_data.csv')
os.remove('weather_data.csv')
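
The five calls at the bottom of daily_api__pollution.py run as soon as the module is imported. As a minimal sketch (reusing only the function and file names defined above; the main() wrapper itself is an assumption, not part of the commit), the same pipeline could be guarded so the module can also be imported without side effects:

# Hypothetical entry-point wrapper; the function names come from daily_api__pollution.py,
# the main() guard itself is an assumption.
def main():
    api_call()                      # fetch one week of NO2/O3 measurements into per-day CSVs
    NO2, O3 = clean_values()        # average the values per particle per day, then delete the CSVs
    add_columns()                   # add empty NO2/O3/weekday columns to weather_data.csv
    scale()                         # reorder, rename and rescale the weather columns
    insert_pollution(NO2, O3)       # write the daily averages into recorded_data.csv
    os.remove('combined_data.csv')  # clean up the intermediate files
    os.remove('weather_data.csv')


if __name__ == "__main__":
    main()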
data_loading.py ADDED
@@ -0,0 +1,276 @@
import numpy as np
import pandas as pd


def create_lag_features_for_single_day(data, random_index, lag_days):
    lag_features = [
        column
        for column in data.columns
        if column
        in [
            "O3",
            "NO2",
            "wind_speed",
            "mean_temp",
            "global_radiation",
            "percipitation",
            "pressure",
            "minimum_visibility",
            "humidity",
        ]
    ]
    lagged_data = {}
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            try:
                lagged_value = data.loc[random_index - lag, feature]
                lagged_data[f"{feature}_lag_{lag}"] = lagged_value
            except (KeyError, IndexError):  # .loc raises KeyError when the lagged label is missing
                print(
                    f"Value not found for feature {feature} lagged by {lag} from day {random_index}"
                )
                continue

    # Add together lagged features, non-lagged features and date
    current_data = data.iloc[random_index].to_dict()
    current_data.update(lagged_data)
    return pd.DataFrame([current_data])


def create_targets_for_single_day(data, random_index, target_column, days_ahead):
    targets = {}
    for day in range(1, days_ahead + 1):
        future_index = random_index + day
        try:
            targets[f"{target_column}_{day}_days_ahead"] = data.loc[
                future_index, target_column
            ]
        except (KeyError, IndexError):  # .loc raises KeyError when the future label is missing
            print(
                f"Value not found for particle {target_column} forwarded by {day} day"
            )

    return pd.DataFrame([targets])


def load_data_batch(data, target_particle, lag_days):
    data["date"] = pd.to_datetime(data["date"])

    # Exclude period with missing O3 data + buffer before and after for targets and lag features
    start_exclusion = pd.to_datetime("2022-01-01") - pd.Timedelta(days=3)
    end_exclusion = pd.to_datetime("2022-04-27") + pd.Timedelta(days=lag_days)
    valid_data = data[
        ~((data["date"] >= start_exclusion) & (data["date"] <= end_exclusion))
    ]
    valid_data = valid_data[
        lag_days:-3
    ]  # also exclude the first `lag_days` and the last three days of the dataset

    # Get a random day in the valid data
    random_index = np.random.choice(valid_data.index, 1)[0]

    # Create lag features and targets for the selected day
    train_data = create_lag_features_for_single_day(data, random_index, lag_days)
    targets = create_targets_for_single_day(
        data, random_index, target_particle, days_ahead=3
    )

    return train_data, targets


def create_features_and_targets(
    data,
    target_particle,  # 'O3' or 'NO2'
    lag_days=7,
    sma_days=7,
    days_ahead=3,
):
    """
    Creates lagged features, SMA features, last year's particle data (NO2 and O3) for specific days,
    sine and cosine transformations for 'weekday' and 'month', and target variables for the specified
    particle ('O3' or 'NO2') for the next 'days_ahead' days. Scales features and targets without
    disregarding outliers and saves the scalers for inverse scaling. Splits the data into train,
    validation, and test sets using the most recent dates. Prints the number of rows with missing
    values dropped from the dataset.

    Parameters:
    - data (pd.DataFrame): The input time-series dataset.
    - target_particle (str): The target particle ('O3' or 'NO2') for which targets are created.
    - lag_days (int): Number of lag days to create features for (default 7).
    - sma_days (int): Window size for the Simple Moving Average (default 7).
    - days_ahead (int): Number of days ahead to create target variables for (default 3).

    Returns:
    - X_train_scaled (pd.DataFrame): Scaled training features.
    - y_train_scaled (pd.DataFrame): Scaled training targets.
    - X_val_scaled (pd.DataFrame): Scaled validation features (365 days).
    - y_val_scaled (pd.DataFrame): Scaled validation targets (365 days).
    - X_test_scaled (pd.DataFrame): Scaled test features (365 days).
    - y_test_scaled (pd.DataFrame): Scaled test targets (365 days).
    """
    import warnings

    import joblib
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    warnings.filterwarnings("ignore")

    lag_features = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    if target_particle == "NO2":
        lag_features = lag_features + ["percipitation", "pressure"]

    if target_particle not in ["O3", "NO2"]:
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    data = data.copy()
    data["date"] = pd.to_datetime(data["date"])
    data = data.sort_values("date").reset_index(drop=True)

    # Extract 'weekday' and 'month' from 'date' if not present
    if "weekday" not in data.columns or data["weekday"].dtype == object:
        data["weekday"] = data["date"].dt.weekday  # Monday=0, Sunday=6
    if "month" not in data.columns:
        data["month"] = data["date"].dt.month  # 1 to 12

    # Create sine and cosine transformations for 'weekday' and 'month'
    data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
    data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
    data["month_sin"] = np.sin(
        2 * np.pi * (data["month"] - 1) / 12
    )  # Adjust month to 0-11
    data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)

    # Create lagged features for the specified lag days
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            data[f"{feature}_lag_{lag}"] = data[feature].shift(lag)

    # Create SMA features
    for feature in lag_features:
        data[f"{feature}_sma_{sma_days}"] = (
            data[feature].rolling(window=sma_days).mean()
        )

    # Create particle data (NO2 and O3) from the same time last year
    # Today last year
    data["O3_last_year"] = data["O3"].shift(365)
    data["NO2_last_year"] = data["NO2"].shift(365)

    # 'lag_days' days before today last year
    for i in range(1, lag_days + 1):
        data[f"O3_last_year_{i}_days_before"] = data["O3"].shift(365 + i)
        data[f"NO2_last_year_{i}_days_before"] = data["NO2"].shift(365 + i)

    # 3 days after today last year
    data["O3_last_year_3_days_after"] = data["O3"].shift(365 - 3)
    data["NO2_last_year_3_days_after"] = data["NO2"].shift(365 - 3)

    # Create targets only for the specified particle for the next 'days_ahead' days
    for day in range(1, days_ahead + 1):
        data[f"{target_particle}_plus_{day}_day"] = data[target_particle].shift(-day)

    # Calculate the number of rows before dropping missing values
    rows_before = data.shape[0]

    # Drop missing values
    data = data.dropna().reset_index(drop=True)

    # Calculate the number of rows after dropping missing values
    rows_after = data.shape[0]

    # Calculate and print the number of rows dropped
    rows_dropped = rows_before - rows_after
    print(f"Number of rows with missing values dropped: {rows_dropped}")

    # Now split the data into train, validation, and test sets using the most recent dates
    total_days = data.shape[0]
    test_size = 365
    val_size = 365

    if total_days < test_size + val_size:
        raise ValueError(
            "Not enough data to create validation and test sets of 365 days each."
        )

    # Ensure the data is sorted by date in ascending order
    data = data.sort_values("date").reset_index(drop=True)

    # Split data
    train_data = data.iloc[: -(val_size + test_size)]
    val_data = data.iloc[-(val_size + test_size) : -test_size]
    test_data = data.iloc[-test_size:]

    # Define target columns for the specified particle
    target_cols = [
        f"{target_particle}_plus_{day}_day" for day in range(1, days_ahead + 1)
    ]

    # Define feature columns
    exclude_cols = ["date", "weekday", "month"] + target_cols
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    # Split features and targets
    X_train = train_data[feature_cols]
    y_train = train_data[target_cols]

    X_val = val_data[feature_cols]
    y_val = val_data[target_cols]

    X_test = test_data[feature_cols]
    y_test = test_data[target_cols]

    # Initialize scalers
    feature_scaler = StandardScaler()
    target_scaler = StandardScaler()

    # Fit the scalers on the training data
    X_train_scaled = feature_scaler.fit_transform(X_train)
    y_train_scaled = target_scaler.fit_transform(y_train)

    # Apply the scalers to validation and test data
    X_val_scaled = feature_scaler.transform(X_val)
    y_val_scaled = target_scaler.transform(y_val)

    X_test_scaled = feature_scaler.transform(X_test)
    y_test_scaled = target_scaler.transform(y_test)

    # Convert scaled data back to DataFrames for consistency
    X_train_scaled = pd.DataFrame(
        X_train_scaled, columns=feature_cols, index=X_train.index
    )
    y_train_scaled = pd.DataFrame(
        y_train_scaled, columns=target_cols, index=y_train.index
    )

    X_val_scaled = pd.DataFrame(X_val_scaled, columns=feature_cols, index=X_val.index)
    y_val_scaled = pd.DataFrame(y_val_scaled, columns=target_cols, index=y_val.index)

    X_test_scaled = pd.DataFrame(
        X_test_scaled, columns=feature_cols, index=X_test.index
    )
    y_test_scaled = pd.DataFrame(y_test_scaled, columns=target_cols, index=y_test.index)

    # Save the scalers to files
    joblib.dump(feature_scaler, "feature_scaler.joblib")
    # Save the target scaler with the particle name to distinguish between O3 and NO2
    target_scaler_filename = f"target_scaler_{target_particle}.joblib"
    joblib.dump(target_scaler, target_scaler_filename)

    return (
        X_train_scaled,
        y_train_scaled,
        X_val_scaled,
        y_val_scaled,
        X_test_scaled,
        y_test_scaled,
    )
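
The commit message notes that data_loading.py still needs to be modified for inference; as committed, its two entry points are create_features_and_targets (full preprocessing and train/val/test split) and load_data_batch (lag features and targets for one random day). A minimal usage sketch, assuming a recorded_data.csv with the column names produced by daily_api__pollution.py (the file name and the choice of target particle are illustrative, not part of the commit):

# Hypothetical usage of the two functions above; recorded_data.csv and the
# target particle "O3" are assumptions for illustration only.
import pandas as pd

from data_loading import create_features_and_targets, load_data_batch

data = pd.read_csv("recorded_data.csv")

# Full preprocessing: lagged/SMA/last-year features, cyclical encodings,
# target columns, and scaled splits (scalers are saved as .joblib files).
X_train, y_train, X_val, y_val, X_test, y_test = create_features_and_targets(
    data, target_particle="O3", lag_days=7, sma_days=7, days_ahead=3
)

# Single-day variant: lag features and 3-day-ahead targets for one random valid day.
features, targets = load_data_batch(data, target_particle="O3", lag_days=7)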