Data Wrangling COVID-19 Mini-Challenge¶

Imports/Setup¶

1	import pandas as pd
2	import wikidata_plain_sparql as wikidata
3
4	from bokeh.palettes import turbo
5	from bokeh.plotting import figure, output_notebook, show, gridplot
6	from bokeh.models import ColumnDataSource, HoverTool
7
8	# set bokeh output mode to notebook
9	output_notebook()
10
11	from helper import get_jhu_cached, create_grid, get_bokeh_tools

Loading BokehJS ...

Worldwide COVID-19 Data¶

1	# get all dates from earliest available data until yesterday
2	all_dates = pd.date_range(start='2020-01-22', end='today')[:-1]
3
4	# standardize column names for all entries
5	def rename_columns(column):
6	column_map = {
7	'Lat': 'Latitude',
8	'Long_': 'Longitude',
9	'Incidence_Rate': 'Incident_Rate'
10	}
11	if column in column_map:
12	return column_map[column]
13	return column.replace('/', '_').replace('-', '_').replace(' ', '_')
14
15	# load data from all dates
16	all_data = []
17	for date in all_dates:
18	data = pd.read_csv(get_jhu_cached(date))
19	data.rename(columns=rename_columns, inplace=True)
20	data['Date'] = date
21	all_data.append(data)
22
23	# combine data from all days
24	raw_data = pd.concat(all_data)

1	# standardize country names for all entries
2	country_mapping = {
3	'MS Zaandam\|Diamond Princess\|Cruise Ship': 'Others', # move cruise ships to others
4	'Hong Kong.+': 'Hong Kong',
5	'Iran.+': 'Iran',
6	'.Congo.': 'Congo',
7	'Mainland China': 'China',
8	'.Bahamas.': 'The Bahamas',
9	'.Gambia.': 'The Gambia',
10	'Viet Nam': 'Vietnam',
11	'Taiwan\*': 'Taiwan',
12	'Cote d\'Ivoire': 'Ivory Coast',
13	'Cabo Verde': 'Cape Verde',
14	'Russian Federation': 'Russia',
15	' Azerbaijan': 'Azerbaijan',
16	'Holy See': 'Vatican City',
17	'Republic of Ireland': 'Ireland',
18	'Republic of Moldova': 'Moldova',
19	'Czechia': 'Czech Republic',
20	'Republic of Korea\|Korea, South': 'South Korea',
21	'Timor-Leste': 'East Timor',
22	'Macao SAR\|Macau': 'Macao',
23	'UK': 'United Kingdom',
24	'Jersey\|Guernsey': 'Channel Islands',
25	'Dominican Republicn Republic\|Dominica': 'Dominican Republic'
26	}
27
28	raw_data['Country_Region'].replace(to_replace=country_mapping.keys(), value=country_mapping.values(), regex=True, inplace=True)

1	# group data by country
2	daily_updates = raw_data.groupby(['Country_Region', 'Date']).agg(
3	Confirmed = ('Confirmed','sum'),
4	Deaths = ('Deaths','sum'),
5	).reset_index()
6
7	# get changes in data
8	updates_per_country = daily_updates.groupby('Country_Region')
9	daily_updates['New_Confirmed'] = updates_per_country['Confirmed'].diff().fillna(0)
10	daily_updates['New_Deaths'] = updates_per_country['Deaths'].diff().fillna(0)
11
12	worldwide_pretty = daily_updates.loc[:, ['Date', 'Country_Region', 'New_Confirmed', 'Confirmed', 'New_Deaths', 'Deaths']]

1	# get all countries incl. colors
2	all_countries = daily_updates['Country_Region'].sort_values().unique()
3	colors = turbo(all_countries.size)

1	# show new cases
2	tools = get_bokeh_tools(tooltips=[
3	('Country', '@country'),
4	('Date', '@x{%F}'),
5	('New Cases', '@y{0,0}'),
6	])
7
8	new_cases_graph = figure(
9	title='New COVID-19 cases per country',
10	y_axis_label='new cases',
11	x_axis_type='datetime',
12	sizing_mode='stretch_width',
13	tools=tools)
14
15	i = 0
16	for country in all_countries:
17	updates_for_country = daily_updates.loc[daily_updates['Country_Region'] == country]
18	data = ColumnDataSource(data={
19	'x': updates_for_country['Date'],
20	'y': updates_for_country['New_Confirmed'],
21	'country': updates_for_country['Country_Region']
22	})
23	new_cases_graph.line('x', 'y', source=data, line_color=colors[i], line_width=2)
24	i += 1
25	show(new_cases_graph)

1	# show total cases
2	tools = get_bokeh_tools(tooltips=[
3	('Country', '@country'),
4	('Date', '@x{%F}'),
5	('Total Cases', '@y{0,0}'),
6	])
7
8	total_cases_graph = figure(
9	title='Total COVID-19 cases per country',
10	y_axis_label='total cases',
11	x_axis_type='datetime',
12	sizing_mode='stretch_width',
13	tools=tools)
14
15	i = 0
16	for country in all_countries:
17	updates_for_country = daily_updates.loc[daily_updates['Country_Region'] == country]
18	data = ColumnDataSource(data={
19	'x': updates_for_country['Date'],
20	'y': updates_for_country['Confirmed'],
21	'country': updates_for_country['Country_Region']
22	})
23	total_cases_graph.line('x', 'y', source=data, line_color=colors[i], line_width=2)
24	i += 1
25	show(total_cases_graph)

Swiss COVID-19 Data¶

1	# get population data from WikiData
2	canton_data = wikidata.query('''
3	SELECT ?shortCode ?population ?canton WHERE {
4	?canton wdt:P31 wd:Q23058.
5	?canton wdt:P300 ?shortCode.
6	OPTIONAL {
7	?canton p:P1082 ?population_stmt.
8	?population_stmt ps:P1082 ?population.
9	?population_stmt pq:P585 ?population_date.
10	}
11	FILTER NOT EXISTS {
12	?canton p:P1082/pq:P585 ?population_date_.
13	FILTER (?population_date_ > ?population_date)
14	}
15	}
16	ORDER BY ?shortCode
17	''')
18	canton_data.set_index('shortCode', inplace=True)

1	raw_data = pd.read_csv('https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv')
2
3	# convert to date
4	raw_data['date'] = pd.to_datetime(raw_data['date'])
5
6	# remove FL
7	swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']
8
9	# only use data after 1st of june
10	swiss = swiss[swiss['date'] >= '2020-06-01']
11
12	# only keep useful entries
13	conf_cases = swiss.loc[-swiss['ncumul_conf'].isna()].copy()
14
15	# get all cantons
16	all_cantons = conf_cases['abbreviation_canton_and_fl'].unique()
17	all_cantons.sort()
18
19	# calculate new cases/deaths
20	conf_cases_per_canton = conf_cases.groupby('abbreviation_canton_and_fl')
21	conf_cases['new_cases'] = conf_cases_per_canton['ncumul_conf'].diff().fillna(0)
22	conf_cases['new_deaths'] = conf_cases_per_canton['ncumul_deceased'].diff().fillna(0)
23
24	# calculate total cases/deaths
25	conf_cases['total_cases'] = conf_cases_per_canton['new_cases'].cumsum()
26	conf_cases['total_deaths'] = conf_cases_per_canton['new_deaths'].cumsum()
27
28	# calculate values per 100'000 residents
29	def relative_values(group):
30	canton = group.name
31	return group.div(int(canton_data.at['CH-' + canton, 'population']) / 100000)
32
33	conf_cases['new_cases_relative'] = conf_cases_per_canton['new_cases'].apply(relative_values)
34	conf_cases['total_cases_relative'] = conf_cases_per_canton['total_cases'].apply(relative_values)
35
36	swiss_pretty = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases', 'total_cases', 'new_deaths', 'total_deaths']]

New COVID-19 cases per 100'0000 residents¶

1	# calculate new cases
2	graphs = []
3	max_new_cases = conf_cases['new_cases_relative'].max()
4	for canton in all_cantons:
5	update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton].copy()
6
7	update_for_canton['new_cases_relative_avg'] = update_for_canton['new_cases_relative'].rolling(window=7).mean()
8
9	new_cases_graph = figure(title=canton, y_axis_label='new cases', y_range=[0, max_new_cases], x_axis_type='datetime')
10	new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative'], line_width=1)
11	new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative_avg'], line_color='red', line_width=1)
12	graphs.append(new_cases_graph)
13
14	show(create_grid(graphs, sizing_mode='scale_width'))

Total COVID-19 cases since 1st of june per 100'000 residents¶

1	# calculate total cases
2	graphs = []
3	max_total_cases = conf_cases['total_cases_relative'].max()
4	for canton in all_cantons:
5	update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
6
7	total_cases_graph = figure(title=canton, y_axis_label='total cases', y_range=[0, max_total_cases], x_axis_type='datetime')
8	total_cases_graph.line(update_for_canton['date'], update_for_canton['total_cases_relative'], line_width=1)
9	graphs.append(total_cases_graph)
10
11	show(create_grid(graphs, sizing_mode='scale_width'))

Final Data Frames¶

1	worldwide_pretty

	Date	Country_Region	New_Confirmed	Confirmed	New_Deaths	Deaths
0	2020-02-24	Afghanistan	0.0	5.0	0.0	0.0
1	2020-02-25	Afghanistan	0.0	5.0	0.0	0.0
2	2020-02-26	Afghanistan	0.0	5.0	0.0	0.0
3	2020-02-27	Afghanistan	0.0	5.0	0.0	0.0
4	2020-02-28	Afghanistan	0.0	5.0	0.0	0.0
...	...	...	...	...	...	...
119753	2020-03-12	occupied Palestinian territory	0.0	0.0	0.0	0.0
119754	2020-03-14	occupied Palestinian territory	0.0	0.0	0.0	0.0
119755	2020-03-15	occupied Palestinian territory	0.0	0.0	0.0	0.0
119756	2020-03-16	occupied Palestinian territory	0.0	0.0	0.0	0.0
119757	2020-03-17	occupied Palestinian territory	0.0	0.0	0.0	0.0

119758 rows × 6 columns

1	swiss_pretty

	date	abbreviation_canton_and_fl	new_cases	total_cases	new_deaths	total_deaths
2371	2020-06-01	BL	0.0	0.0	0.0	0.0
2372	2020-06-01	FR	0.0	0.0	0.0	0.0
2373	2020-06-01	GR	0.0	0.0	0.0	0.0
2374	2020-06-01	JU	0.0	0.0	0.0	0.0
2375	2020-06-01	NE	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...
15584	2021-11-15	TG	1.0	28303.0	0.0	486.0
15585	2021-11-15	UR	80.0	4345.0	0.0	47.0
15586	2021-11-15	BS	51.0	16672.0	0.0	173.0
15587	2021-11-15	AI	44.0	2115.0	1.0	19.0
15588	2021-11-15	ZH	410.0	148324.0	0.0	1378.0

12404 rows × 6 columns

1