Exploring the Novel Coronavirus (COVID-19) Dataset

Exploratory Data Analysis of the Novel Coronavirus (COVID-19) Dataset
Data Science
COVID-19
Exploratory Data Analysis
Author

Daniel Fat

Published

April 5, 2020

Code
%%time
# Fetch the dataset archive with the Kaggle CLI, but only when the zip file is
# not already present in the working directory.
import os
f_name = 'novel-corona-virus-2019-dataset.zip'
if not os.path.exists(f_name):
  # Credentials are intentionally left blank here; fill them in before running.
  # NOTE(review): "the json file" presumably means the Kaggle API token file — confirm.
  os.environ['KAGGLE_USERNAME'] = "" # username from the json file
  os.environ['KAGGLE_KEY'] = "" # key from the json file
  # IPython shell escape: runs the kaggle command-line tool.
  !kaggle datasets download -d sudalairajkumar/novel-corona-virus-2019-dataset
CPU times: user 325 µs, sys: 65 µs, total: 390 µs
Wall time: 605 µs
Code
import zipfile
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import networkx as nx
Code
# Load every CSV member of the downloaded archive into a dict of DataFrames,
# keyed by the member name without its file extension.
df = {}
with zipfile.ZipFile(f_name) as z:
    for fz in z.namelist():
        # Skip directory entries and non-CSV members that read_csv cannot parse.
        if not fz.lower().endswith('.csv'):
            continue
        with z.open(fz) as f:
            # Strip only the final extension so member names that contain
            # extra dots keep their full, unique key.
            df[fz.rsplit('.', 1)[0]] = pd.read_csv(f)
Code
# List the table names that were loaded from the archive.
df.keys()
dict_keys(['COVID19_line_list_data', 'COVID19_open_line_list', 'covid_19_data', 'time_series_covid_19_confirmed', 'time_series_covid_19_deaths', 'time_series_covid_19_recovered'])

First Preview of the Data

Code
# Pull out the three time-series tables (confirmed, deaths, recovered).
# dict.get is kept so a missing table yields None exactly as before.
conf, dth, recov = map(
    df.get,
    (
        'time_series_covid_19_confirmed',
        'time_series_covid_19_deaths',
        'time_series_covid_19_recovered',
    ),
)
Code
# Show the first five rows of the confirmed-cases table.
conf.head(5)
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 1/31/20 2/1/20 2/2/20 2/3/20 2/4/20 2/5/20 2/6/20 2/7/20 2/8/20 2/9/20 2/10/20 2/11/20 2/12/20 2/13/20 2/14/20 2/15/20 2/16/20 2/17/20 2/18/20 2/19/20 2/20/20 2/21/20 2/22/20 2/23/20 2/24/20 2/25/20 2/26/20 2/27/20 2/28/20 2/29/20 3/1/20 3/2/20 3/3/20 3/4/20 3/5/20 3/6/20 3/7/20 3/8/20 3/9/20 3/10/20 3/11/20 3/12/20 3/13/20 3/14/20 3/15/20 3/16/20 3/17/20 3/18/20 3/19/20 3/20/20 3/21/20 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20
0 NaN Afghanistan 33.0000 65.0000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 5 7 7 7 11 16 21 22 22 22 24 24 40 40 74 84 94 110
1 NaN Albania 41.1533 20.1683 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 10 12 23 33 38 42 51 55 59 64 70 76 89 104 123 146 174 186
2 NaN Algeria 28.0339 1.6596 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 3 5 12 12 17 17 19 20 20 20 24 26 37 48 54 60 74 87 90 139 201 230 264 302 367 409
3 NaN Andorra 42.5063 1.5218 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 39 39 53 75 88 113 133 164 188 224 267
4 NaN Angola -11.2027 17.8739 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 3 3 3 4 4
Code
# Show the first five rows of the deaths table.
dth.head(5)
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 1/31/20 2/1/20 2/2/20 2/3/20 2/4/20 2/5/20 2/6/20 2/7/20 2/8/20 2/9/20 2/10/20 2/11/20 2/12/20 2/13/20 2/14/20 2/15/20 2/16/20 2/17/20 2/18/20 2/19/20 2/20/20 2/21/20 2/22/20 2/23/20 2/24/20 2/25/20 2/26/20 2/27/20 2/28/20 2/29/20 3/1/20 3/2/20 3/3/20 3/4/20 3/5/20 3/6/20 3/7/20 3/8/20 3/9/20 3/10/20 3/11/20 3/12/20 3/13/20 3/14/20 3/15/20 3/16/20 3/17/20 3/18/20 3/19/20 3/20/20 3/21/20 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20
0 NaN Afghanistan 33.0000 65.0000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 4 4
1 NaN Albania 41.1533 20.1683 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 2 2 2 2 2 4 5 5 6 8
2 NaN Algeria 28.0339 1.6596 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 4 4 7 9 11 15 17 17 19 21 25 26
3 NaN Andorra 42.5063 1.5218 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 3 3
4 NaN Angola -11.2027 17.8739 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Code
# Show the first five rows of the recovered table.
recov.head(5)
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 1/31/20 2/1/20 2/2/20 2/3/20 2/4/20 2/5/20 2/6/20 2/7/20 2/8/20 2/9/20 2/10/20 2/11/20 2/12/20 2/13/20 2/14/20 2/15/20 2/16/20 2/17/20 2/18/20 2/19/20 2/20/20 2/21/20 2/22/20 2/23/20 2/24/20 2/25/20 2/26/20 2/27/20 2/28/20 2/29/20 3/1/20 3/2/20 3/3/20 3/4/20 3/5/20 3/6/20 3/7/20 3/8/20 3/9/20 3/10/20 3/11/20 3/12/20 3/13/20 3/14/20 3/15/20 3/16/20 3/17/20 3/18/20 3/19/20 3/20/20 3/21/20 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20
0 NaN Afghanistan 33.0000 65.0000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2
1 NaN Albania 41.1533 20.1683 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 10 17 17 31
2 NaN Algeria 28.0339 1.6596 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 8 12 12 12 12 12 32 32 32 65 65 24 65 29 29
3 NaN Andorra 42.5063 1.5218 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
4 NaN Angola -11.2027 17.8739 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Code
# The first four columns are metadata (Province/State, Country/Region, Lat,
# Long); every column after them is one reporting date.
conf_time_cols = list(conf.columns[4:])
dth_time_cols = list(dth.columns[4:])
recov_time_cols = list(recov.columns[4:])
Code
# One empty undirected graph per table; the cells below fill them in.
conf_graph, dth_graph, recov_graph = nx.Graph(), nx.Graph(), nx.Graph()
Code
# Populate the confirmed-cases graph: each row links its country to its
# province ("state-level") and the province to the latest confirmed count
# ("number-level").
# Iterating the three columns in lockstep avoids calling __len__() directly
# and re-materializing the same row with iloc several times per iteration.
# NOTE(review): rows without a Province/State hold NaN there (see the preview
# above), so those countries are attached to a NaN node — confirm intended.
for country, province, confirmed in zip(
    conf['Country/Region'],
    conf['Province/State'],
    conf[conf_time_cols[-1]],
):
    conf_graph.add_edge(country, province, relation='state-level')
    conf_graph.add_edge(province, confirmed, relation='number-level')

Confirmed Cases at a Country-State Level

Code
# Open a large 50x50-inch canvas, then draw the confirmed-cases graph with
# node labels onto the current figure.
plt.figure(figsize=(50,50))
nx.draw_networkx(conf_graph)

Death Cases at a Country-State Level

Code
# Populate the deaths graph: each row links its country to its province
# ("state-level") and the province to the latest death count ("number-level").
# Both edges are built from the deaths table; the earlier version read the
# number-level edge from `conf`, which put confirmed counts in this graph.
for country, province, deaths in zip(
    dth['Country/Region'],
    dth['Province/State'],
    dth[dth_time_cols[-1]],
):
    dth_graph.add_edge(country, province, relation='state-level')
    dth_graph.add_edge(province, deaths, relation='number-level')
Code
# Open a large 50x50-inch canvas, then draw the deaths graph with node labels
# onto the current figure.
plt.figure(figsize=(50,50))
nx.draw_networkx(dth_graph)

Recovered Cases at a Country-State Level

Code
# Populate the recovered graph: each row links its country to its province
# ("state-level") and the province to the latest recovered count
# ("number-level").
# Both edges are built from the recovered table. The earlier version looped
# over recov's length but read rows from `dth` and `conf`, so the graph held
# no recovered data, and row i of one table is not guaranteed to match row i
# of another.
for country, province, recovered in zip(
    recov['Country/Region'],
    recov['Province/State'],
    recov[recov_time_cols[-1]],
):
    recov_graph.add_edge(country, province, relation='state-level')
    recov_graph.add_edge(province, recovered, relation='number-level')
Code
# Open a large 50x50-inch canvas, then draw the recovered graph with node
# labels onto the current figure.
plt.figure(figsize=(50,50))
nx.draw_networkx(recov_graph)

Overview of the Confirmed Cases over Time

Code
# One box per reporting date for the confirmed table.
# NOTE(review): the boxes are drawn over the describe() summary rows, not over
# the raw per-region values — confirm that this is the intended view.
conf.loc[:, conf_time_cols].describe().boxplot(figsize=(55, 20))

Overview of the Death Cases over Time

Code
# One box per reporting date for the deaths table.
# NOTE(review): the boxes are drawn over the describe() summary rows, not over
# the raw per-region values — confirm that this is the intended view.
dth.loc[:, dth_time_cols].describe().boxplot(figsize=(55, 20))

Overview of the Recovered Cases over Time

Code
# One box per reporting date for the recovered table.
# NOTE(review): the boxes are drawn over the describe() summary rows, not over
# the raw per-region values — confirm that this is the intended view.
recov.loc[:, recov_time_cols].describe().boxplot(figsize=(55, 20))

Latest Confirmed Cases Grouped by Country/Region and Province/State

Code
# Sum the latest confirmed count per (country, province) pair. Only the
# latest-date column is kept in the result.
(
    conf
    .groupby(['Country/Region', 'Province/State'])[[conf_time_cols[-1]]]
    .sum()
)
3/27/20
Country/Region Province/State
Australia Australian Capital Territory 62
New South Wales 1405
Northern Territory 12
Queensland 555
South Australia 257
... ... ...
United Kingdom Cayman Islands 8
Channel Islands 88
Gibraltar 55
Isle of Man 29
Montserrat 5

76 rows × 1 columns

Latest Death Cases Grouped by Country/Region and Province/State

Code
# Sum the latest death count per (country, province) pair. Only the
# latest-date column is kept in the result.
(
    dth
    .groupby(['Country/Region', 'Province/State'])[[dth_time_cols[-1]]]
    .sum()
)
3/27/20
Country/Region Province/State
Australia Australian Capital Territory 0
New South Wales 7
Northern Territory 0
Queensland 1
South Australia 0
... ... ...
United Kingdom Cayman Islands 1
Channel Islands 1
Gibraltar 0
Isle of Man 0
Montserrat 0

76 rows × 1 columns

Latest Recovered Cases Grouped by Country/Region and Province/State

Code
# Sum the latest recovered count per (country, province) pair. Only the
# latest-date column is kept in the result.
(
    recov
    .groupby(['Country/Region', 'Province/State'])[[recov_time_cols[-1]]]
    .sum()
)
3/27/20
Country/Region Province/State
Australia Australian Capital Territory 1
New South Wales 4
Northern Territory 0
Queensland 8
South Australia 6
... ... ...
United Kingdom Cayman Islands 0
Channel Islands 0
Gibraltar 14
Isle of Man 0
Montserrat 0

61 rows × 1 columns

Code
# Worldwide totals on the latest available date: add up the last date column
# of each table.
total_cases, total_deaths, total_recovs = (
    frame[date_cols[-1]].sum()
    for frame, date_cols in (
        (conf, conf_time_cols),
        (dth, dth_time_cols),
        (recov, recov_time_cols),
    )
)
Code
# Display the three totals as a (confirmed, deaths, recovered) tuple.
total_cases, total_deaths, total_recovs
(593291, 27198, 130915)
Code
# Express deaths and recoveries as a percentage of all confirmed cases.
death_rate, recov_rate = (
    (count / total_cases) * 100.0
    for count in (total_deaths, total_recovs)
)
Code
# Bar chart comparing the overall death and recovery percentages.
bar_labels = ('Death Ratio', 'Recover Ratio')
bar_heights = [death_rate, recov_rate]
bar_positions = np.arange(len(bar_labels))

plt.figure(figsize=(10, 10))
plt.bar(bar_positions, bar_heights, align='center', alpha=0.5)
plt.xticks(bar_positions, bar_labels)  # label each bar with its ratio name
plt.ylabel('%')
plt.title('Death vs Recover Ratio')

plt.show()

Code
# Bar chart of the absolute totals: confirmed, deaths and recovered.
bar_labels = ('Total Confirmed', 'Total Deaths', 'Total Recovered')
bar_heights = [total_cases, total_deaths, total_recovs]
bar_positions = np.arange(len(bar_labels))

plt.figure(figsize=(10, 10))
plt.bar(bar_positions, bar_heights, align='center', alpha=0.5)
plt.xticks(bar_positions, bar_labels)  # label each bar with its total name
plt.title("Numbers of " + " - ".join(bar_labels))

plt.show()

Code
import plotly.express as px
Code
# Copy each table's latest date column into a common 'cases' column so the
# maps below can share one hover configuration. The frames are changed in place.
for frame, date_cols in (
    (conf, conf_time_cols),
    (dth, dth_time_cols),
    (recov, recov_time_cols),
):
    frame['cases'] = frame[date_cols[-1]]
Code
# Columns shown in the hover tooltip of each map below.
hov_data = [
    "Province/State",
    "Country/Region",
    "cases",
]

Map of Confirmed cases on the last available day

Code
# Interactive map with one marker per row of the confirmed table.
# NOTE(review): plotly renders its own figure, so this matplotlib figure stays
# empty and only yields the "<Figure ... with 0 Axes>" output — consider
# dropping it.
plt.figure(figsize=(25, 25))
fig = px.scatter_mapbox(
    conf,
    lat="Lat",
    lon="Long",
    hover_name="Province/State",
    hover_data=hov_data,
    color_discrete_sequence=["fuchsia"],
    zoom=3,
)
# Dark base map with no margin around the plot.
fig.update_layout(
    mapbox_style="carto-darkmatter",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()
<Figure size 1800x1800 with 0 Axes>

Map of Death cases on the last available day

Code
# Interactive map with one marker per row of the deaths table.
# NOTE(review): plotly renders its own figure, so this matplotlib figure stays
# empty and only yields the "<Figure ... with 0 Axes>" output — consider
# dropping it.
plt.figure(figsize=(50, 50))
fig = px.scatter_mapbox(
    dth,
    lat="Lat",
    lon="Long",
    hover_name="Province/State",
    hover_data=hov_data,
    color_discrete_sequence=["fuchsia"],
    zoom=3,
)
# Dark base map with no margin around the plot.
fig.update_layout(
    mapbox_style="carto-darkmatter",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()
<Figure size 3600x3600 with 0 Axes>

Map of Recovered cases on the last available day

Code
# Interactive map with one marker per row of the recovered table.
# NOTE(review): plotly renders its own figure, so this matplotlib figure stays
# empty and only yields the "<Figure ... with 0 Axes>" output — consider
# dropping it.
plt.figure(figsize=(25, 25))
fig = px.scatter_mapbox(
    recov,
    lat="Lat",
    lon="Long",
    hover_name="Province/State",
    hover_data=hov_data,
    color_discrete_sequence=["fuchsia"],
    zoom=3,
)
# Dark base map with no margin around the plot.
fig.update_layout(
    mapbox_style="carto-darkmatter",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()
<Figure size 1800x1800 with 0 Axes>