Forecasting hurricane movements - A machine learning approach
Felix Kroeber - Introduction to data science & machine learning (winter term 2021/22)
# general setup
import geopandas as gpd
import matplotlib.pyplot as plt
import multiprocess
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
from datetime import datetime, timedelta
from dtreeviz.trees import dtreeviz
from geographiclib.geodesic import Geodesic
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import Image, HTML
from ipywidgets import widgets, Layout
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import tree
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline
InteractiveShell.ast_node_interactivity = "all"
plt.rcParams['figure.figsize'] = [12, 7.5]
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 3)
pd.options.mode.chained_assignment = None
# read hurricane data set
workdir = "C:\\Users\\felix\\OneDrive\\Studium_Master\\electives\\Intro_to_ML\\end_of_term"
hurricanes = pd.read_csv(os.path.join(workdir, "atlantic.csv"))
hurricanes.head()
hurricanes.tail()
hurricanes.info()
# transform lat/long into numeric values
def coord_converter(coord_str):
    # strip the hemisphere letter and negate southern/western coordinates
    if any(x in coord_str for x in ["N", "E"]):
        return float(coord_str[:-1])
    if any(x in coord_str for x in ["S", "W"]):
        return float("-" + coord_str[:-1])
hurricanes["Longitude"] = hurricanes["Longitude"].apply(coord_converter)
hurricanes["Latitude"] = hurricanes["Latitude"].apply(coord_converter)
# transform date and time into datetime
hurricanes['Date'] = [str(date) for date in hurricanes['Date']]
hurricanes['Time'] = [str(date) for date in hurricanes['Time']]
hurricanes['Time'] = hurricanes['Time'].str.pad(width=4, fillchar='0')
hurricanes.insert(2,
"Datetime",
[datetime.strptime(dt[0]+dt[1], "%Y%m%d%H%M") for dt in zip(hurricanes['Date'], hurricanes['Time'])])
hurricanes.drop(columns=['Date', 'Time'], inplace=True)
# extract year, month, day and hour as integer columns
hurricanes.insert(3, "Year", [x.year for x in hurricanes.Datetime])
hurricanes.insert(4, "Month", [x.month for x in hurricanes.Datetime])
hurricanes.insert(5, "Day", [x.day for x in hurricanes.Datetime])
hurricanes.insert(6, "Hour", [x.hour for x in hurricanes.Datetime])
# remove white spaces in front of names
hurricanes["Name"] = hurricanes["Name"].str.lstrip()
# encode NaNs
hurricanes.replace(["", -99, -999], np.nan, inplace=True)
hurricanes.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# remove duplicates
hurricanes.drop_duplicates(inplace=True)
On average, there are about 29 records per hurricane. The longest track record of a single hurricane dates back to 1899 and contains 133 entries.
Nine different status codes are used to characterise the condition of a hurricane. The meaning of the abbreviations can be found in the data documentation (the most frequent code TS, for example, denotes a tropical cyclone of tropical storm intensity (34-63 knots)). Essentially, the status is a categorical variable that combines information already contained in the latitude and the wind intensity variables.
There are also nine different values for the event variable, which encodes events such as landfall (L). Note that most events contain information which can only be determined post hoc (e.g. intensity peak or minimum in central pressure). Since only special events are recorded, the event variable is populated in slightly fewer than 1000 cases.
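The counts quoted here can be reproduced with a few one-liners. The sketch below is only a cross-check and assumes that the status and event information is stored in columns named "Status" and "Event" (only "ID" and "Event" appear explicitly in the later code).
# sketch for verifying the summary figures quoted above ("Status" is an assumed column name)
records_per_storm = hurricanes["ID"].value_counts()
print(f"average records per hurricane: {records_per_storm.mean():.1f}")
print(f"longest single track: {records_per_storm.max()} entries")
print(f"distinct status codes: {hurricanes['Status'].nunique()}")
print(f"records with an event entry: {hurricanes['Event'].notna().sum()}")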
The bulk of the data set describes hurricanes in the 20th century: about half of all entries fall into the 100-year period 1850-1950, while the other half were recorded in the following 65 years.
All locations where hurricanes have been recorded are in the northern hemisphere, with a large proportion in the tropical-subtropical region. With regard to longitude, there are obviously some erroneous entries. The minimum longitude does not correspond to a valid coordinate, and the maximum entry of 63 degrees is highly improbable given its distance from the Atlantic Ocean.
While the maximum wind speed is available for almost all data points, the minimum pressure in the hurricane system was only recorded for slightly more than a third of the entries. Both variables are inconspicuous with regard to their values; the reported minima and maxima appear plausible.
The variables describing the wind radii have values for only about 5900 entries. Including them as features in the following analyses would therefore mean excluding a large part of the data.
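The value ranges and the missing-value pattern described above can be inspected with a minimal sketch along the following lines; it assumes that the wind radii columns are exactly those following "Minimum Pressure", as implied by the column slice applied further below.
# sketch for inspecting value ranges and missingness (wind radii columns assumed to follow "Minimum Pressure")
print(hurricanes["Longitude"].agg(["min", "max"]))
print(f"records with maximum wind: {hurricanes['Maximum Wind'].notna().sum()}")
print(f"records with minimum pressure: {hurricanes['Minimum Pressure'].notna().sum()}")
radii_cols = hurricanes.columns[hurricanes.columns.get_loc("Minimum Pressure") + 1:]
print(f"records with complete wind radii: {hurricanes[radii_cols].notna().all(axis=1).sum()}")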
# print basic summary stats for all vars
hurricanes.describe(include=object)
hurricanes.describe(datetime_is_numeric=True)
The findings so far suggest that the following data processing steps are sensible. Together, they reduce the initial data set to those data points for which values for the variables of interest are available.
# keep only records after 1950 due to larger inaccuracies in earlier entries
hurricanes = hurricanes[hurricanes["Datetime"] > datetime.strptime("19500101", "%Y%m%d")]
# only keep entries for which at least 5 data points are available
long_track_hurricanes = hurricanes['ID'].value_counts()[hurricanes['ID'].value_counts() >= 5]
hurricanes = hurricanes[hurricanes['ID'].isin(long_track_hurricanes.index.tolist())]
# only retain entries for which pressure is available
hurricanes = hurricanes[~hurricanes['Minimum Pressure'].isna()]
# remove wind radii vars with lot of NaN values
hurricanes = hurricanes.loc[:,:"Minimum Pressure"]
# remove events as they are not useful
hurricanes.drop(columns="Event", inplace=True)
# Plotting time intervals between records
hurricanes_grouped = hurricanes.groupby("ID")
res = []
for name, group in hurricanes_grouped:
    # time differences between consecutive records of the same hurricane
    delta_t = (group["Datetime"] - group["Datetime"].shift()).iloc[1:]
    res.append(delta_t)
res = [item for sublist in res for item in sublist]
res = [x/pd.Timedelta(minutes=60) for x in res]
plt.hist(res, bins=30, range=(0,30))
plt.ylabel('Frequency')
plt.xlabel('Time interval (h)');
# remove all entries not corresponding to the synoptic times 00,06,12,18
syn_times_idx = [dt.strftime("%H%M") in ["0000", "0600", "1200", "1800"] for dt in hurricanes["Datetime"]]
hurricanes = hurricanes[syn_times_idx]
# analyse temporal patterns in interval times
hurricanes_grouped = hurricanes.groupby("ID")
hurricanes["Time interval"] = hurricanes_grouped["Datetime"].transform(lambda x: x - x.shift())
_hurricanes = hurricanes.dropna(subset=["Time interval"])
# plotting time intervals between records
hurricanes_grouped = _hurricanes.groupby([x for x in 5 * np.floor(_hurricanes["Year"]/5)])
hurricanes_vis = pd.DataFrame({
"n_data_points" : hurricanes_grouped["ID"].count(),
"p_long_intervals" : hurricanes_grouped["Time interval"].agg(lambda x: sum(x != pd.Timedelta(hours=6))/len(x))
})
fig, ax1 = plt.subplots()
# plotting number of data points on left axis
ax1.bar(hurricanes_vis.index, "n_data_points", data = hurricanes_vis, width = 1.5)
ax1.set_ylabel("number of data points")
ax1.set_xticks(ticks = [*[x-2.5 for x in hurricanes_vis.index], hurricanes_vis.index[-1]+2.5],
labels = [int(x) for x in [*hurricanes_vis.index, hurricanes_vis.index[-1]+5]],
rotation = 45)
ax1.set_ylim(0)
# plotting percentage of intervals longer than 6 hours on right axis
ax2 = ax1.twinx()
ax2.plot(hurricanes_vis.index, hurricanes_vis.p_long_intervals, color="darkred")
ax2.set_ylabel("\n percentage of intervals > 6 hours")
ax2.set_ylim(0, 1);
# keep only records from 1980 onwards & remove later ones with gaps larger than 6 hours
hurricanes = hurricanes[hurricanes["Datetime"] > datetime.strptime("19800101", "%Y%d%m")]
rm_ids = hurricanes[[x > pd.Timedelta(hours=6) for x in hurricanes["Time interval"]]]["ID"]
hurricanes = hurricanes[~hurricanes["ID"].isin(rm_ids)]
print(f"{len(rm_ids)} hurricane tracks removed due to time gaps larger than 6 hours")
# interactive plotting of trajectories
warnings.filterwarnings("ignore")
# create user interface inputs
years = widgets.IntRangeSlider(
value=[1900, 2100],
min=min([x.year for x in hurricanes["Datetime"]]),
max=max([x.year for x in hurricanes["Datetime"]]),
step=1,
description='Years (inclusive):',
style = {'description_width': 'initial'},
layout=Layout(width='50%')
)
months_dt = [datetime.strptime(str(i), "%m") for i in range(1,13)]
options = [(i.strftime('%b'), i) for i in months_dt]
months = widgets.SelectionRangeSlider(
options=options,
index=(0,11),
description='Months (inclusive):',
style = {'description_width': 'initial'},
layout=Layout(width='40%')
)
hurricane_ids = list(hurricanes.ID.unique())
id_select = widgets.SelectMultiple(
options=hurricane_ids,
value=hurricane_ids,
description='Id select:',
rows=10,
style = {'description_width': 'initial'}
)
popup_vars = widgets.SelectMultiple(
options=list(hurricanes.select_dtypes(exclude=['datetime', 'timedelta']).columns),
value=["Maximum Wind", "Minimum Pressure"],
description='Popup variables:',
rows=10,
style = {'description_width': 'initial'}
)
numeric_vars = list(hurricanes.select_dtypes(include=['number']).columns)
numeric_vars.insert(0,None)
categorial_vars = list(hurricanes.select_dtypes(include=['object']).columns)
size_var = widgets.Dropdown(
options=numeric_vars,
description='Marker size:',
style = {'description_width': 'initial'}
)
color_var_numerical = widgets.Dropdown(
options=numeric_vars,
description='Marker color 1st map:',
value="Maximum Wind",
style = {'description_width': 'initial'}
)
color_var_categorial = widgets.Dropdown(
options=categorial_vars,
description='Marker color 2nd map:',
value="Name",
style = {'description_width': 'initial'}
)
basemap = widgets.RadioButtons(
options=['osm_base', 'osm_topographic'],
value='osm_base'
)
out_info = widgets.Output()
# create function to define two interlinked maps with their initial settings
# two maps only differ in their vis settings w.r.t. marker color
# two maps need to be defined due to different layer stack composition (numeric vs. categorial vis var)
def hurrican_map(color_var):
    fig = go.FigureWidget(data = px.scatter_mapbox(
        data_frame = hurricanes,
        lat="Latitude",
        lon="Longitude",
        color=color_var,
        hover_data={"Latitude": False,
                    "Longitude": False,
                    "Maximum Wind": True,
                    "Minimum Pressure": True},
        width=975,
        height=700,
        center=go.layout.mapbox.Center(lat=47.5, lon=-60),
        zoom=1.75))
    fig = fig.update_layout(mapbox_style="open-street-map")
    return fig
map_I = hurrican_map("Maximum Wind")
map_II = hurrican_map("Name")
# define update (backend) procedures
class map_update:
    # initialisation procedure - set df & map to apply updates
    def __init__(self, map):
        self.df = hurricanes
        self.map = map
    def df_update(self):
        # get selection of years
        start_year = str(years.value[0]) + "0101"
        end_year = str(years.value[1]+1) + "0101"
        # get selection of months
        start_month = months.value[0].month
        end_month = months.value[1].month+1
        # apply temporal subsetting
        self.df = self.df[self.df["Datetime"].between(datetime.strptime(start_year, "%Y%m%d"),
                                                      datetime.strptime(end_year, "%Y%m%d"))]
        self.df = self.df[[x.month in range(start_month, end_month) for x in self.df["Datetime"]]]
        # apply id subsetting
        self.df = self.df[[x in id_select.value for x in self.df["ID"]]]
    def map_vis_update(self, color_var):
        # clear previous outputs
        out_info.clear_output()
        # get columns to display in popups
        hover_vars = {}
        for col in list(self.df.select_dtypes(exclude=['datetime', 'timedelta']).columns):
            hover_vars[col] = col in popup_vars.value
        # get size var (None means no size encoding)
        marker_size_var = size_var.value
        # configure fig based on selection
        if len(self.df) != 0:
            self.map_new = go.FigureWidget(data = px.scatter_mapbox(
                data_frame = self.df,
                lat="Latitude",
                lon="Longitude",
                size = marker_size_var,
                color = color_var.value,
                hover_data=hover_vars))
            self.map_new = self.map_new.update_layout(
                {'coloraxis': {'colorbar': {'title': {'text': color_var.value}}}}
            )
        else:
            with out_info:
                print("Selection obtained no data points. No map update was done. Reload of widget necessary!")
            # abort early - there is no new figure to style or to copy properties from
            return
        # update basemap
        if basemap.value == "osm_base":
            self.map_new = self.map_new.update_layout(
                mapbox_style="open-street-map",
                mapbox_layers=[
                    {
                        "below": 'traces',
                        "sourcetype": "raster",
                        "sourceattribution": "© OpenStreetMap contributors",
                        "source": [
                            "https://tile.openstreetmap.org/{z}/{x}/{y}.png"
                        ]
                    }
                ])
        else:
            self.map_new = self.map_new.update_layout(
                mapbox_style="white-bg",
                mapbox_layers=[
                    {
                        "below": 'traces',
                        "sourcetype": "raster",
                        "sourceattribution": "© OpenStreetMap contributors",
                        "source": [
                            "https://tile.opentopomap.org/{z}/{x}/{y}.png"
                        ]
                    }
                ])
        # update old map using new properties
        self.map.update(data = self.map_new.data,
                        layout = {'coloraxis': self.map_new.layout.coloraxis,
                                  'mapbox': {'layers': self.map_new.layout.mapbox.layers,
                                             'style': self.map_new.layout.mapbox.style}})
def response_map_I(change):
    map_I_updater = map_update(map_I)
    map_I_updater.df_update()
    map_I_updater.map_vis_update(color_var_numerical)
def response_map_II(change):
    map_II_updater = map_update(map_II)
    map_II_updater.df_update()
    map_II_updater.map_vis_update(color_var_categorial)
# observe changes
for response in [response_map_I, response_map_II]:
    years.observe(response, names="value")
    months.observe(response, names="value")
    id_select.observe(response, names="value")
    popup_vars.observe(response, names="value")
    size_var.observe(response, names="value")
    basemap.observe(response, names="value")
color_var_numerical.observe(response_map_I, names="value")
color_var_categorial.observe(response_map_II, names="value")
# configure layout & display figure
accordion = widgets.Accordion([widgets.VBox([years, months]),
widgets.VBox([id_select]),
widgets.HBox([widgets.VBox([popup_vars], layout=Layout(padding='0px 25px 0px 0px')),
widgets.VBox([size_var, color_var_numerical, color_var_categorial])]),
basemap])
accordion.set_title(0, 'Temporal Subsetting')
accordion.set_title(1, 'ID Subsetting')
accordion.set_title(2, 'Visualisation Settings')
accordion.set_title(3, 'Basemap Settings')
widgets.VBox([accordion, out_info, map_I, map_II])
# reset warnings to default
warnings.filterwarnings("default")