Data Wrangling COVID-19 Mini-Challenge

Imports/Setup

1 import pandas as pd
2 import wikidata_plain_sparql as wikidata
3
4 from bokeh.palettes import turbo
5 from bokeh.plotting import figure, output_notebook, show, gridplot
6 from bokeh.models import ColumnDataSource, HoverTool
7
8 # set bokeh output mode to notebook
9 output_notebook()
10
11 from helper import get_jhu_cached, create_grid, get_bokeh_tools

Loading BokehJS ...

Worldwide COVID-19 Data

1 # get all dates from earliest available data until yesterday
2 all_dates = pd.date_range(start='2020-01-22', end='today')[:-1]
3
4 # standardize column names for all entries
5 def rename_columns(column):
6 column_map = {
7 'Lat': 'Latitude',
8 'Long_': 'Longitude',
9 'Incidence_Rate': 'Incident_Rate'
10 }
11 if column in column_map:
12 return column_map[column]
13 return column.replace('/', '_').replace('-', '_').replace(' ', '_')
14
15 # load data from all dates
16 all_data = []
17 for date in all_dates:
18 data = pd.read_csv(get_jhu_cached(date))
19 data.rename(columns=rename_columns, inplace=True)
20 data['Date'] = date
21 all_data.append(data)
22
23 # combine data from all days
24 raw_data = pd.concat(all_data)
1 # standardize country names for all entries
2 country_mapping = {
3 'MS Zaandam|Diamond Princess|Cruise Ship': 'Others', # move cruise ships to others
4 'Hong Kong.+': 'Hong Kong',
5 'Iran.+': 'Iran',
6 '.*Congo.*': 'Congo',
7 'Mainland China': 'China',
8 '.*Bahamas.*': 'The Bahamas',
9 '.*Gambia.*': 'The Gambia',
10 'Viet Nam': 'Vietnam',
11 'Taiwan\*': 'Taiwan',
12 'Cote d\'Ivoire': 'Ivory Coast',
13 'Cabo Verde': 'Cape Verde',
14 'Russian Federation': 'Russia',
15 ' Azerbaijan': 'Azerbaijan',
16 'Holy See': 'Vatican City',
17 'Republic of Ireland': 'Ireland',
18 'Republic of Moldova': 'Moldova',
19 'Czechia': 'Czech Republic',
20 'Republic of Korea|Korea, South': 'South Korea',
21 'Timor-Leste': 'East Timor',
22 'Macao SAR|Macau': 'Macao',
23 'UK': 'United Kingdom',
24 'Jersey|Guernsey': 'Channel Islands',
25 'Dominican Republicn Republic|Dominica': 'Dominican Republic'
26 }
27
28 raw_data['Country_Region'].replace(to_replace=country_mapping.keys(), value=country_mapping.values(), regex=True, inplace=True)
1 # group data by country
2 daily_updates = raw_data.groupby(['Country_Region', 'Date']).agg(
3 Confirmed = ('Confirmed','sum'),
4 Deaths = ('Deaths','sum'),
5 ).reset_index()
6
7 # get changes in data
8 updates_per_country = daily_updates.groupby('Country_Region')
9 daily_updates['New_Confirmed'] = updates_per_country['Confirmed'].diff().fillna(0)
10 daily_updates['New_Deaths'] = updates_per_country['Deaths'].diff().fillna(0)
11
12 worldwide_pretty = daily_updates.loc[:, ['Date', 'Country_Region', 'New_Confirmed', 'Confirmed', 'New_Deaths', 'Deaths']]
1 # get all countries incl. colors
2 all_countries = daily_updates['Country_Region'].sort_values().unique()
3 colors = turbo(all_countries.size)
1 # show new cases
2 tools = get_bokeh_tools(tooltips=[
3 ('Country', '@country'),
4 ('Date', '@x{%F}'),
5 ('New Cases', '@y{0,0}'),
6 ])
7
8 new_cases_graph = figure(
9 title='New COVID-19 cases per country',
10 y_axis_label='new cases',
11 x_axis_type='datetime',
12 sizing_mode='stretch_width',
13 tools=tools)
14
15 i = 0
16 for country in all_countries:
17 updates_for_country = daily_updates.loc[daily_updates['Country_Region'] == country]
18 data = ColumnDataSource(data={
19 'x': updates_for_country['Date'],
20 'y': updates_for_country['New_Confirmed'],
21 'country': updates_for_country['Country_Region']
22 })
23 new_cases_graph.line('x', 'y', source=data, line_color=colors[i], line_width=2)
24 i += 1
25 show(new_cases_graph)

1 # show total cases
2 tools = get_bokeh_tools(tooltips=[
3 ('Country', '@country'),
4 ('Date', '@x{%F}'),
5 ('Total Cases', '@y{0,0}'),
6 ])
7
8 total_cases_graph = figure(
9 title='Total COVID-19 cases per country',
10 y_axis_label='total cases',
11 x_axis_type='datetime',
12 sizing_mode='stretch_width',
13 tools=tools)
14
15 i = 0
16 for country in all_countries:
17 updates_for_country = daily_updates.loc[daily_updates['Country_Region'] == country]
18 data = ColumnDataSource(data={
19 'x': updates_for_country['Date'],
20 'y': updates_for_country['Confirmed'],
21 'country': updates_for_country['Country_Region']
22 })
23 total_cases_graph.line('x', 'y', source=data, line_color=colors[i], line_width=2)
24 i += 1
25 show(total_cases_graph)

Swiss COVID-19 Data

1 # get population data from WikiData
2 canton_data = wikidata.query('''
3 SELECT ?shortCode ?population ?canton WHERE {
4 ?canton wdt:P31 wd:Q23058.
5 ?canton wdt:P300 ?shortCode.
6 OPTIONAL {
7 ?canton p:P1082 ?population_stmt.
8 ?population_stmt ps:P1082 ?population.
9 ?population_stmt pq:P585 ?population_date.
10 }
11 FILTER NOT EXISTS {
12 ?canton p:P1082/pq:P585 ?population_date_.
13 FILTER (?population_date_ > ?population_date)
14 }
15 }
16 ORDER BY ?shortCode
17 ''')
18 canton_data.set_index('shortCode', inplace=True)
1 raw_data = pd.read_csv('https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv')
2
3 # convert to date
4 raw_data['date'] = pd.to_datetime(raw_data['date'])
5
6 # remove FL
7 swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']
8
9 # only use data after 1st of june
10 swiss = swiss[swiss['date'] >= '2020-06-01']
11
12 # only keep useful entries
13 conf_cases = swiss.loc[-swiss['ncumul_conf'].isna()].copy()
14
15 # get all cantons
16 all_cantons = conf_cases['abbreviation_canton_and_fl'].unique()
17 all_cantons.sort()
18
19 # calculate new cases/deaths
20 conf_cases_per_canton = conf_cases.groupby('abbreviation_canton_and_fl')
21 conf_cases['new_cases'] = conf_cases_per_canton['ncumul_conf'].diff().fillna(0)
22 conf_cases['new_deaths'] = conf_cases_per_canton['ncumul_deceased'].diff().fillna(0)
23
24 # calculate total cases/deaths
25 conf_cases['total_cases'] = conf_cases_per_canton['new_cases'].cumsum()
26 conf_cases['total_deaths'] = conf_cases_per_canton['new_deaths'].cumsum()
27
28 # calculate values per 100'000 residents
29 def relative_values(group):
30 canton = group.name
31 return group.div(int(canton_data.at['CH-' + canton, 'population']) / 100000)
32
33 conf_cases['new_cases_relative'] = conf_cases_per_canton['new_cases'].apply(relative_values)
34 conf_cases['total_cases_relative'] = conf_cases_per_canton['total_cases'].apply(relative_values)
35
36 swiss_pretty = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases', 'total_cases', 'new_deaths', 'total_deaths']]

New COVID-19 cases per 100'0000 residents

1 # calculate new cases
2 graphs = []
3 max_new_cases = conf_cases['new_cases_relative'].max()
4 for canton in all_cantons:
5 update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton].copy()
6
7 update_for_canton['new_cases_relative_avg'] = update_for_canton['new_cases_relative'].rolling(window=7).mean()
8
9 new_cases_graph = figure(title=canton, y_axis_label='new cases', y_range=[0, max_new_cases], x_axis_type='datetime')
10 new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative'], line_width=1)
11 new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative_avg'], line_color='red', line_width=1)
12 graphs.append(new_cases_graph)
13
14 show(create_grid(graphs, sizing_mode='scale_width'))

Total COVID-19 cases since 1st of june per 100'000 residents

1 # calculate total cases
2 graphs = []
3 max_total_cases = conf_cases['total_cases_relative'].max()
4 for canton in all_cantons:
5 update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
6
7 total_cases_graph = figure(title=canton, y_axis_label='total cases', y_range=[0, max_total_cases], x_axis_type='datetime')
8 total_cases_graph.line(update_for_canton['date'], update_for_canton['total_cases_relative'], line_width=1)
9 graphs.append(total_cases_graph)
10
11 show(create_grid(graphs, sizing_mode='scale_width'))

Final Data Frames

1 worldwide_pretty

Date Country_Region New_Confirmed Confirmed New_Deaths Deaths
0 2020-02-24 Afghanistan 0.0 5.0 0.0 0.0
1 2020-02-25 Afghanistan 0.0 5.0 0.0 0.0
2 2020-02-26 Afghanistan 0.0 5.0 0.0 0.0
3 2020-02-27 Afghanistan 0.0 5.0 0.0 0.0
4 2020-02-28 Afghanistan 0.0 5.0 0.0 0.0
... ... ... ... ... ... ...
119753 2020-03-12 occupied Palestinian territory 0.0 0.0 0.0 0.0
119754 2020-03-14 occupied Palestinian territory 0.0 0.0 0.0 0.0
119755 2020-03-15 occupied Palestinian territory 0.0 0.0 0.0 0.0
119756 2020-03-16 occupied Palestinian territory 0.0 0.0 0.0 0.0
119757 2020-03-17 occupied Palestinian territory 0.0 0.0 0.0 0.0

119758 rows × 6 columns

1 swiss_pretty

date abbreviation_canton_and_fl new_cases total_cases new_deaths total_deaths
2371 2020-06-01 BL 0.0 0.0 0.0 0.0
2372 2020-06-01 FR 0.0 0.0 0.0 0.0
2373 2020-06-01 GR 0.0 0.0 0.0 0.0
2374 2020-06-01 JU 0.0 0.0 0.0 0.0
2375 2020-06-01 NE 0.0 0.0 0.0 0.0
... ... ... ... ... ... ...
15584 2021-11-15 TG 1.0 28303.0 0.0 486.0
15585 2021-11-15 UR 80.0 4345.0 0.0 47.0
15586 2021-11-15 BS 51.0 16672.0 0.0 173.0
15587 2021-11-15 AI 44.0 2115.0 1.0 19.0
15588 2021-11-15 ZH 410.0 148324.0 0.0 1378.0

12404 rows × 6 columns

1