{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"toc": true
},
"source": [
"
Table of Contents
\n",
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import numpy.random as rn\n",
"import numpy.testing as nt\n",
"import datetime as dt\n",
"from datetime import timezone as tz\n",
"import pprint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataframe practice\n",
"\n",
"Learning objectives\n",
"\n",
"- Be able to use the apply method to execute a function on every row of a dataframe\n",
"- Be able to add a column to a data frame and use groupby with that column to \n",
" group dataframe rows into subset dataframes\n",
"- Be able to do simple statistics (mean, median, max, min, summary) on dataframes and dataframe series\n",
"- Be able to construct dataframes from lists of tuples, lists of dictionaries, or numpy arrays using\n",
" from_records member function\n",
"- Be able to create and work with python datetime objects\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a simulated dataset\n",
"\n",
"This cell creates 3 columns: column 0 is temperatures\n",
"in the range 263-283 K, column 1 is veloc in the range\n",
"-100 to +100 dm/s, column 2 is mass in the range "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 2.67439863e+02 7.41464612e+01 2.06719155e-01]\n",
" [ 2.81372218e+02 -2.31776224e+00 6.11743863e-01]\n",
" [ 2.78318157e+02 3.68359757e+00 2.96800502e-01]\n",
" [ 2.66754425e+02 -8.38517462e+01 7.38440296e-01]\n",
" [ 2.71826184e+02 -6.83380265e+01 8.79937031e-01]]\n"
]
}
],
"source": [
"my_seed = 5\n",
"rn.seed(my_seed)\n",
"uni_dist = rn.rand(25,3)\n",
"uni_dist[:,:2] = (uni_dist[:,:2] - 0.5)*2.\n",
"temps = uni_dist[:,0]*10. + 273.\n",
"veloc = uni_dist[:,1]*100.\n",
"uni_dist[:,0]=temps\n",
"uni_dist[:,1]=veloc\n",
"print(uni_dist[:5,:])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert to a dataframe\n",
"\n",
"with columns=['temperature','veloc','mass']\n",
"\n",
"### Q1: Turn uni_dist into a dataframe called df\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" temperature | \n",
" veloc | \n",
" mass | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 267.439863 | \n",
" 74.146461 | \n",
" 0.206719 | \n",
"
\n",
" \n",
" 1 | \n",
" 281.372218 | \n",
" -2.317762 | \n",
" 0.611744 | \n",
"
\n",
" \n",
" 2 | \n",
" 278.318157 | \n",
" 3.683598 | \n",
" 0.296801 | \n",
"
\n",
" \n",
" 3 | \n",
" 266.754425 | \n",
" -83.851746 | \n",
" 0.738440 | \n",
"
\n",
" \n",
" 4 | \n",
" 271.826184 | \n",
" -68.338026 | \n",
" 0.879937 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" temperature veloc mass\n",
"0 267.439863 74.146461 0.206719\n",
"1 281.372218 -2.317762 0.611744\n",
"2 278.318157 3.683598 0.296801\n",
"3 266.754425 -83.851746 0.738440\n",
"4 271.826184 -68.338026 0.879937"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Q1 answer\n",
"columns=['temperature','veloc','mass']\n",
"df=pd.DataFrame(uni_dist,columns=columns)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It should pass this test"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"nt.assert_allclose(df['veloc'][1],-2.3,rtol=0.1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a datetime for each row\n",
"\n",
"Assume that each row represents a measurement, and that\n",
"the measurements happened every 10 minutes atarting at\n",
"5 am UCT on March 25, 2019"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"start and time interval: 2019-03-25 05:00:00+00:00 -- 0:10:00\n",
"[datetime.datetime(2019, 3, 25, 5, 0, tzinfo=datetime.timezone.utc),\n",
" datetime.datetime(2019, 3, 25, 5, 10, tzinfo=datetime.timezone.utc),\n",
" datetime.datetime(2019, 3, 25, 5, 20, tzinfo=datetime.timezone.utc),\n",
" datetime.datetime(2019, 3, 25, 5, 30, tzinfo=datetime.timezone.utc),\n",
" datetime.datetime(2019, 3, 25, 5, 40, tzinfo=datetime.timezone.utc)]\n"
]
}
],
"source": [
"start_time=dt.datetime(2019,3,25,5,0,0,tzinfo=tz.utc)\n",
"interval=dt.timedelta(minutes=10)\n",
"print(f'start and time interval: {start_time} -- {interval}')\n",
"time_list=[start_time]\n",
"for tick in range(len(df)-1):\n",
" time_list.append(time_list[tick] + interval)\n",
"pprint.pprint(time_list[:5])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make time_list the index to the dataframe\n",
"\n",
"### Add it as a new column"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df['date']=time_list"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q2 -- set this column as an index\n",
"\n",
"google pandas.DataFrame.set_index for method arguments"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\begin{tabular}{lrrr}\n",
"\\toprule\n",
"{} & temperature & veloc & mass \\\\\n",
"date & & & \\\\\n",
"\\midrule\n",
"2019-03-25 05:00:00+00:00 & 267.439863 & 74.146461 & 0.206719 \\\\\n",
"2019-03-25 05:10:00+00:00 & 281.372218 & -2.317762 & 0.611744 \\\\\n",
"2019-03-25 05:20:00+00:00 & 278.318157 & 3.683598 & 0.296801 \\\\\n",
"2019-03-25 05:30:00+00:00 & 266.754425 & -83.851746 & 0.738440 \\\\\n",
"2019-03-25 05:40:00+00:00 & 271.826184 & -68.338026 & 0.879937 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\n"
]
}
],
"source": [
"#Q2 answer\n",
"df['date']=time_list\n",
"df.set_index('date',inplace=True)\n",
"df.head()\n",
"print(df.head().to_latex())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Your dataframe should pass this assert."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"target_row=start_time=dt.datetime(2019,3,25,5,40,0,tzinfo=tz.utc)\n",
"nt.assert_allclose(df.loc[target_row]['temperature'],271.8,rtol=1.e-2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q3 write a function that rounds a column value to 1 decimal\n",
"\n",
"i.e. complete this function\n",
"\n",
"```\n",
"def coarse_grain(row,colname):\n",
" #\n",
" # input -- dataframe row called row and string colname\n",
" #\n",
" # returns -- row[colname] rounded to 1 decimal\n",
" return col_val\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Q3 answer\n",
"def coarse_grain(row,colname):\n",
" col_val = np.around(row[colname],decimals=1)\n",
" return col_val"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Your function should pass this assert"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"the_row = df.iloc[2]\n",
"the_round = coarse_grain(the_row,'mass')\n",
"np.testing.assert_allclose(the_round,0.3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q4 Add a new column called \"mass_tenths\" using coarse_grain\n",
"\n",
"i.e. use df.apply to run coarse_grain on the dataframe and add the result\n",
"as the 'mass_tenths' column\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#Q4 answer\n",
"df['mass_tenths']=\\\n",
" df.apply(coarse_grain,args=('mass',),axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You dataframe should now pass the following assert"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"nt.assert_allclose(df['mass_tenths'][3],0.7)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q5: Group the measurements on mass_tenths"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Use groups=df.groupby to create a DataFrameGroupBy object\n",
"called groups indexed on mass_tenths\n",
"\n",
"Then find the median temperature, speed and mass of each group\n",
"using the the_median=groups.median() method\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"#Q5 answer\n",
"groups=df.groupby('mass_tenths')\n",
"the_median=groups.median()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"nt.assert_allclose(the_median.loc[0.2]['veloc'],1.49,rtol=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" temperature | \n",
" veloc | \n",
" mass | \n",
"
\n",
" \n",
" mass_tenths | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0.0 | \n",
" 280.409662 | \n",
" 84.549923 | \n",
" 0.002214 | \n",
"
\n",
" \n",
" 0.1 | \n",
" 276.975578 | \n",
" -14.607772 | \n",
" 0.090364 | \n",
"
\n",
" \n",
" 0.2 | \n",
" 274.237067 | \n",
" 1.489661 | \n",
" 0.188541 | \n",
"
\n",
" \n",
" 0.3 | \n",
" 268.481729 | \n",
" -17.152996 | \n",
" 0.296080 | \n",
"
\n",
" \n",
" 0.4 | \n",
" 272.389767 | \n",
" 96.293748 | \n",
" 0.398945 | \n",
"
\n",
" \n",
" 0.5 | \n",
" 270.220968 | \n",
" -92.898194 | \n",
" 0.546358 | \n",
"
\n",
" \n",
" 0.6 | \n",
" 275.635321 | \n",
" 3.094522 | \n",
" 0.599929 | \n",
"
\n",
" \n",
" 0.7 | \n",
" 265.120278 | \n",
" -71.470318 | \n",
" 0.719142 | \n",
"
\n",
" \n",
" 0.8 | \n",
" 279.274650 | \n",
" -48.180481 | \n",
" 0.795087 | \n",
"
\n",
" \n",
" 0.9 | \n",
" 273.766654 | \n",
" -2.978774 | \n",
" 0.903310 | \n",
"
\n",
" \n",
" 1.0 | \n",
" 265.229076 | \n",
" -49.750978 | \n",
" 0.964915 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" temperature veloc mass\n",
"mass_tenths \n",
"0.0 280.409662 84.549923 0.002214\n",
"0.1 276.975578 -14.607772 0.090364\n",
"0.2 274.237067 1.489661 0.188541\n",
"0.3 268.481729 -17.152996 0.296080\n",
"0.4 272.389767 96.293748 0.398945\n",
"0.5 270.220968 -92.898194 0.546358\n",
"0.6 275.635321 3.094522 0.599929\n",
"0.7 265.120278 -71.470318 0.719142\n",
"0.8 279.274650 -48.180481 0.795087\n",
"0.9 273.766654 -2.978774 0.903310\n",
"1.0 265.229076 -49.750978 0.964915"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"groups.median()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q6 Round trip\n",
"\n",
"Add a cell that fixes the following problem -- new_df is not\n",
"the same as our original df after writing it out to csv"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\begin{tabular}{lrrrr}\n",
"\\toprule\n",
"{} & temperature & veloc & mass & mass\\_tenths \\\\\n",
"date & & & & \\\\\n",
"\\midrule\n",
"2019-03-25 05:00:00+00:00 & 267.439863 & 74.146461 & 0.206719 & 0.2 \\\\\n",
"2019-03-25 05:10:00+00:00 & 281.372218 & -2.317762 & 0.611744 & 0.6 \\\\\n",
"2019-03-25 05:20:00+00:00 & 278.318157 & 3.683598 & 0.296801 & 0.3 \\\\\n",
"2019-03-25 05:30:00+00:00 & 266.754425 & -83.851746 & 0.738440 & 0.7 \\\\\n",
"2019-03-25 05:40:00+00:00 & 271.826184 & -68.338026 & 0.879937 & 0.9 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\n"
]
}
],
"source": [
"new_df=pd.read_csv('test.csv')\n",
"new_df.head()\n",
"print(df.head().to_latex())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" temperature | \n",
" veloc | \n",
" mass | \n",
" mass_tenths | \n",
"
\n",
" \n",
" date | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2019-03-25 05:00:00+00:00 | \n",
" 267.439863 | \n",
" 74.146461 | \n",
" 0.206719 | \n",
" 0.2 | \n",
"
\n",
" \n",
" 2019-03-25 05:10:00+00:00 | \n",
" 281.372218 | \n",
" -2.317762 | \n",
" 0.611744 | \n",
" 0.6 | \n",
"
\n",
" \n",
" 2019-03-25 05:20:00+00:00 | \n",
" 278.318157 | \n",
" 3.683598 | \n",
" 0.296801 | \n",
" 0.3 | \n",
"
\n",
" \n",
" 2019-03-25 05:30:00+00:00 | \n",
" 266.754425 | \n",
" -83.851746 | \n",
" 0.738440 | \n",
" 0.7 | \n",
"
\n",
" \n",
" 2019-03-25 05:40:00+00:00 | \n",
" 271.826184 | \n",
" -68.338026 | \n",
" 0.879937 | \n",
" 0.9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" temperature veloc mass mass_tenths\n",
"date \n",
"2019-03-25 05:00:00+00:00 267.439863 74.146461 0.206719 0.2\n",
"2019-03-25 05:10:00+00:00 281.372218 -2.317762 0.611744 0.6\n",
"2019-03-25 05:20:00+00:00 278.318157 3.683598 0.296801 0.3\n",
"2019-03-25 05:30:00+00:00 266.754425 -83.851746 0.738440 0.7\n",
"2019-03-25 05:40:00+00:00 271.826184 -68.338026 0.879937 0.9"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Q6 answer\n",
"new_df=pd.read_csv('test.csv',index_col='date')\n",
"new_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"target_row=start_time=dt.datetime(2019,3,25,5,40,0,tzinfo=tz.utc)\n",
"nt.assert_allclose(df.loc[target_row]['temperature'],271.8,rtol=1.e-2)"
]
}
],
"metadata": {
"jupytext": {
"cell_metadata_filter": "all",
"notebook_metadata_filter": "all,-language_info",
"text_representation": {
"extension": ".py",
"format_name": "percent",
"format_version": "1.2",
"jupytext_version": "1.0.4"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 2
}