62. Convert earthcare to xarray#
This notebook reads in an earthcare hdf5 file and converts it to an xarray dataset by adding dimensions, coordinates and variables. We store the height of the first ray, the time and the along-track distance as coordinates, and the longitude, latitude, reflectivity in dbZ and doppler velocity as variables.
The first 1000 km of the data have a height glitch, so in the second part of the notebook we show how to remove the bad data by cropping on the distance coordinate, rewrite the coordinates with the new height, distance coordinates, and write the dataset out to a netcdf file.
62.1. Installation#
Download a case folder from the satdata/earthcare folder on our gdrive folder
fetch and rebase from upstream/main to get this notebook in week11. You’ll want to modify this notebook to work with your case, so checkout and rebase on to your initials branch:
git checkout main git fetch upstream git rebase upstream/main git checkout your_initials git rebase main git push
To save your work:
git commit -m 'commit message' git push
from pathlib import Path
import xarray as xr
import numpy as np
import pyproj
from matplotlib import pyplot as plt
import datetime
import pytz
import pyproj
62.2. Find the case files#
Organize them in a dictionary keyed by the case name, which is at the start of the relative file path. pathlib has a function that finds the relative path, and also can break the folder path into its various parts
data_dir = Path().home() / 'repos/a301/satdata/earthcare'
radar_filepaths = list(data_dir.glob("**/*.h5"))
filepaths=dict()
for filepath in radar_filepaths:
relpath = filepath.relative_to(data_dir)
casename = relpath.parts[0]
filepaths[casename]=filepath
relpath
PosixPath('case4/ECA_JXCA_CPR_NOM_1B_20241213T224328Z_20250213T185124Z_03096D.h5')
62.2.1. This functions sorts the cases by case number#
def sort_keys(the_case):
#
# the case number starts after 'case' at position 4
#
number = the_case[4:]
#
# return case number as an integer so it will sort
# correctly
#
return int(number)
62.2.2. Pick case 4#
sorted_keys = list(filepaths.keys())
sorted_keys.sort(key=sort_keys)
casenum = 'case4'
62.3. Get coordinates#
The three functions below calculate vectors of height, time and distance that can be used as coordinates for axis labeling, and as indexes for selecting/cropping values.
def get_binheight(the_bins, index=0):
"""
select one of the the radar bin heights and fill in missing data with
estimated values. By default we'll
take index=0, i.e. the bin heights for the first pulse
Parameters
----------
"""
the_bins = the_bins[:,index].data
diff_bins = np.diff(the_bins)
del_y = float(np.nanmean(diff_bins))
new_y = [float(the_bins[0])]
for count,old_y in enumerate(the_bins[1:]):
#
# found a nan to be replaced
#
if np.isnan(old_y):
new_y.append(float(new_y[count]) + float(del_y))
else:
new_y.append(float(old_y))
return np.array(new_y)
def find_times(filepath):
"""
times are stored as seconds after Jan 1, 2000 UTC
use the datetime.timedelta function to increment from
that start time
"""
cpr_meta = xr.open_dataset(filepath, engine='h5netcdf', group='/ScienceData/Geo',
decode_times=False, phony_dims='access')
times = cpr_meta['profileTime'].data
start_time = datetime.datetime(2000,1,1,0,0,0)
the_times =[start_time + datetime.timedelta(seconds=item) for item in times]
#
# netcdf doesn't understand timezones so document the UTC timezone in
# the metadata
#
return the_times
great_circle=pyproj.Geod(ellps='WGS84')
meters2km = 1.e-3
def calc_distance(lonvec,latvec):
"""
the pyproj.Geod object calculates the lat, lon between
2 points on a great circle, and the inverse calculates
the great circle distance between 2 lot/lon points
"""
distance=[0]
startlon,startlat = lonvec[0],latvec[0]
for lon,lat in zip(lonvec[1:],latvec[1:]):
azi12,azi21,step= great_circle.inv(startlon,startlat,lon,lat)
distance.append(distance[-1] + step)
startlon,startlat = lon, lat
distance=np.array(distance)*meters2km
return distance
def ec_to_xarray(ec_filepath,casenum=0):
"""
open an earthcare h5 file and read the times, latitude, longitude, radar reflectivity
and the dopple velocity. Save this as an xarray dataset with coordinates and dimensions
"""
cpr_data = xr.open_dataset(ec_filepath, engine='h5netcdf', group='/ScienceData/Data',
decode_times=False, phony_dims='access')
cpr_data = cpr_data.rename_dims({'phony_dim_0':'distance','phony_dim_1': 'height'})
cpr_meta = xr.open_dataset(ec_filepath, engine='h5netcdf', group='/ScienceData/Geo',
decode_times=False, phony_dims='access')
cpr_meta = cpr_meta.rename_dims({'phony_dim_0':'distance','phony_dim_1': 'height'})
print(f"{cpr_meta.dims=}")
lonvec = cpr_meta['longitude']
latvec = cpr_meta['latitude']
distance = calc_distance(lonvec.data, latvec.data)
the_times = find_times(ec_filepath)
#
# create a DataArray from the list to get correct dimension
#
the_times = xr.DataArray(the_times,dims=('distance',))
binHeights = cpr_meta.binHeight[...].T
heights = get_binheight(binHeights,0)
coords = dict(height=("height",heights),
distance = ("distance",distance))
velocity = cpr_data['dopplerVelocity'].T
radar = cpr_data['radarReflectivityFactor'].T
dbZ = 10*np.log10(radar)
attrs=dict(history = f"written by ec_to_xarray on {str(datetime.datetime.now())}",
timezone="UTC",casenum=casenum)
var_dict = dict(dbZ = dbZ , velocity=velocity,binHeights=binHeights,
longitude=lonvec, latitude=latvec, time = the_times)
ds_earthcare = xr.Dataset(data_vars=var_dict,
coords=coords,attrs=attrs)
return ds_earthcare
62.4. Create the dataset#
Note that the dataset radar_ds has dimensions: (height, time, distance),
coordinates: (height, time and distance) and indices: (height, time, and distance).
They all have the same names but they serve different purposes. The dimensions
show the shape of the data, the coordinates allow for different axis labels,
and the indices can be used for selecting the data, as demonstrated below.
62.4.1. call ec_to_xarray#
filepaths[casenum]
radar_ds = ec_to_xarray(filepaths[casenum],casenum)
radar_ds
cpr_meta.dims=FrozenMappingWarningOnValuesAccess({'distance': 9946, 'height': 218, 'phony_dim_2': 1})
/Users/phil/mini310/envs/a301/lib/python3.13/site-packages/xarray/computation/apply_ufunc.py:821: RuntimeWarning: divide by zero encountered in log10
result_data = func(*input_data)
/Users/phil/mini310/envs/a301/lib/python3.13/site-packages/xarray/computation/apply_ufunc.py:821: RuntimeWarning: invalid value encountered in log10
result_data = func(*input_data)
<xarray.Dataset> Size: 26MB
Dimensions: (height: 218, distance: 9946)
Coordinates:
* height (height) float64 2kB 1.618e+04 1.608e+04 ... -5.501e+03
* distance (distance) float64 80kB 0.0 0.5002 1.0 ... 5.172e+03 5.173e+03
Data variables:
dbZ (height, distance) float32 9MB -39.15 -46.38 nan ... nan nan nan
velocity (height, distance) float32 9MB ...
binHeights (height, distance) float32 9MB ...
longitude (distance) float64 80kB -113.8 -113.8 -113.8 ... -131.1 -131.1
latitude (distance) float64 80kB 67.62 67.62 67.61 ... 22.38 22.37 22.37
time (distance) datetime64[ns] 80kB 2024-12-13T22:43:31.511866 ......
Attributes:
history: written by ec_to_xarray on 2025-03-29 09:50:41.584666
timezone: UTC
casenum: case462.4.2. Plot the radar reflectivity#
fig, ax = plt.subplots(1,1,figsize=(12,3))
radar_ds['dbZ'].plot(ax = ax, x="distance",y="height",vmin=-3,vmax=25);
ax.set_xlabel("distance (km)")
ax.set_ylabel("height (m)")
ax.set_title(casenum);
62.5. Data cleaning#
there’s some data loss in the upper atmosphere for the first 1000 km. When those data values appear at 1000 km it causes an abrupt change in the bin heights. If we really cared about the first 1000 km, we could split the dataset to keep both regions. Instead let’s just remove that data by cropping the first 1000 km.
62.5.1. First find the new index for the start#
The line below finds the index closest to 1000 km, which is index 1924. We want to use this to select our new set of heights, instead of the height values at index 0. np.argmin returns the index of the distance vector that has the smallest absolute difference from 1000.
new_index = np.argmin(np.abs(radar_ds.distance.data - 1000))
new_index
np.int64(1924)
62.5.2. Replace the height coordinates with the value at index 1924#
This function will overwrite the height coordinate with the
new value using assign_coords
def set_new_height(ds,target_index):
"""
replace the height coordinate with a new height
"""
height_array = ds['binHeights']
bin_height = get_binheight(height_array,target_index)
ds = ds.assign_coords({'height': bin_height})
return ds
radar_ds = set_new_height(radar_ds,new_index)
62.5.3. Plot with new height coordinate#
Now that we’re using the heights at 1000 km, the surface is at z=0 starting at 1000 km
fig, ax = plt.subplots(1,1,figsize=(12,3))
radar_ds['dbZ'].plot(ax = ax, x="distance",y="height",vmin=-3,vmax=25);
ax.set_xlabel("distance (km)")
ax.set_ylabel("height (m)")
ax.set_title(f"{casenum} with correct heights");
62.5.4. Crop first 1000 km#
Next remove the first 1000 km using the where function
radar_ds = radar_ds.where(radar_ds.coords['distance'] > 1000.,drop=True)
62.5.5. Cropped plot - original x axis#
fig, ax = plt.subplots(1,1,figsize=(12,3))
radar_ds['dbZ'].plot(ax = ax, x="distance",y="height",vmin=-3,vmax=25);
ax.set_xlabel("distance (km)")
ax.set_ylabel("height (m)")
ax.set_title(f"{casenum} cropped");
62.5.6. rewrite the distance coordinate#
Finally, make the new distance start from 0
new_distance = radar_ds.coords['distance'] - radar_ds.coords['distance'][0]
radar_ds = radar_ds.assign_coords({'distance': new_distance})
62.5.7. Cropped plot – new x axis#
fig, ax = plt.subplots(1,1,figsize=(12,3))
radar_ds['dbZ'].plot(ax = ax, x="distance",y="height",vmin=-3,vmax=25);
ax.set_xlabel("distance (km)")
ax.set_ylabel("height (m)")
ax.set_title(f"{casenum} new x coordinate");
fig, ax = plt.subplots(1,1,figsize=(12,3))
radar_ds['velocity'].plot(ax = ax, x="distance",y="height",vmin=-3,vmax=3);
ax.set_xlabel("distance (km)")
ax.set_ylabel("height (m)")
ax.set_title(f"{casenum} velocity");
62.6. Save the dataset to netcdf#
62.6.1. convert datetimes to numpy.datetime64 objects#
the netcdf writer doesn’t like python datetimes, but can handle numpy datetimes, so convert times to numpy datetime64.
new_times = [np.datetime64(item) for item in radar_ds.variables['time'].data]
#
# create a DataAr
#
new_times = xr.DataArray(new_times,dims=('distance',))
radar_ds['time']=new_times
filename = Path().home() / f"repos/a301/satdata/earthcare/week11_{casenum}_radar.nc"
if filename.exists():
filename.unlink()
radar_ds.to_netcdf(filename,'w')
62.6.1.1. Read it back in to check#
test_ds = xr.open_dataset(filename)
test_ds
<xarray.Dataset> Size: 21MB
Dimensions: (height: 218, distance: 8021)
Coordinates:
* height (height) float64 2kB 2.019e+04 2.009e+04 ... -1.494e+03
* distance (distance) float64 64kB 0.0 0.5027 1.006 ... 4.172e+03 4.172e+03
Data variables:
dbZ (height, distance) float32 7MB ...
velocity (height, distance) float32 7MB ...
binHeights (height, distance) float32 7MB ...
longitude (distance) float64 64kB ...
latitude (distance) float64 64kB ...
time (distance) datetime64[ns] 64kB ...
Attributes:
history: written by ec_to_xarray on 2025-03-29 09:50:41.584666
timezone: UTC
casenum: case4