def CDF_spline_fitting_forecast_precip_mean_v4(cmonth, clead, cdomain): """ CDF_spline_fitting_forecast_precip_mean_v4.py cmonth clead cdomain where cmonth = 'Jan', 'Feb' etc clead = '024' or similar (3 digits) and cdomain typically is "conus" Fit a cubic spline to cumulative hazard function of precipitation and save for spline parameters for fitting of precipitation CDFs. If the point is exceptionally dry, fit a Gamma distribution instead. This version produces spline fit for 5-member mean. This "flexiknot" version here makes a modification on the number of knots used in the spline fit, which is related to the number of samples with positive precipitation; more samples, more knots. Designed by Tom Hamill, NOAA, with help from Michael Scheuerer, Mar 2022 """ import os, sys from datetime import datetime import numpy as np import _pickle as cPickle from netCDF4 import Dataset import _pickle as cPickle import scipy.stats as stats from scipy.interpolate import LSQUnivariateSpline, splrep, splev import matplotlib.pyplot as plt from matplotlib import rcParams from mpl_toolkits.basemap import Basemap, interp from mpl_toolkits.axes_grid1 import make_axes_locatable # ===================================================================== def set_domain_boundaries(cdomain): """ used grib file of 2.5-km blend output grid to determine bounding lat and lon, and from that, the domain bounding indices for the 0.25 GEFSv12 reforecast data that will encompass the domain. """ if cdomain == 'conus': jmin = 93 jmax = 246 imin = 368 imax = 686 elif cdomain == 'pr': jmin = 243 jmax = 256 imin = 649 imax = 667 elif cdomain == 'ak': jmin = 19 jmax = 161 imin = 201 imax = 967 else: print ('invalid domain. Exiting.') sys.exit() return jmin, jmax, imin, imax # ===================================================================== def get_surrounding_months(cmonth): if cmonth == 'Jan': cmonth_early = 'Dec' cmonth_late = 'Feb' elif cmonth == 'Feb': cmonth_early = 'Jan' cmonth_late = 'Mar' elif cmonth == 'Mar': cmonth_early = 'Feb' cmonth_late = 'Apr' elif cmonth == 'Apr': cmonth_early = 'Mar' cmonth_late = 'May' elif cmonth == 'May': cmonth_early = 'Apr' cmonth_late = 'Jun' elif cmonth == 'Jun': cmonth_early = 'May' cmonth_late = 'Jul' elif cmonth == 'Jul': cmonth_early = 'Jun' cmonth_late = 'Aug' elif cmonth == 'Aug': cmonth_early = 'Jul' cmonth_late = 'Sep' elif cmonth == 'Sep': cmonth_early = 'Aug' cmonth_late = 'Oct' elif cmonth == 'Oct': cmonth_early = 'Sep' cmonth_late = 'Nov' elif cmonth == 'Nov': cmonth_early = 'Oct' cmonth_late = 'Dec' elif cmonth == 'Dec': cmonth_early = 'Nov' cmonth_late = 'Jan' else: print ('invalid month') sys.exit() return cmonth_early, cmonth_late # ===================================================================== def fraczero_possamps(nsamps, precip_ens): """ from the vector input sample precip_ens, define the fraction of samples with zero precipitation. For the positive samples, add a small random number to deal with the fact that the data was discretized to ~0.1 mm, so that when later creating CDFs we don't have empirical values with lots of tied amounts. Also, sort the nonzero amounts and return. """ number_zeros = 0 # data discretized, so add random component of this magnitude #precip_ens = np.where(precip_ens < 4.0, precip_ens* \ # np.random.uniform(low=-0.5,high=1.5,size=len(precip_ens)), precip_ens) precip_ens_nonzero = np.delete(precip_ens, \ np.where(precip_ens <= 0.0)) # censor at 0.0 mm precip_ens_nonzero = precip_ens_nonzero + \ np.random.uniform(low=-0.001,high=0.001,size=len(precip_ens_nonzero)) precip_ens_nonzero = np.delete(precip_ens_nonzero, \ np.where(precip_ens_nonzero <= 0.0)) # censor at 0.0 mm nz = len(precip_ens_nonzero) precip_ens_nonzero = np.sort(precip_ens_nonzero) ntotal = len(precip_ens) nzero = ntotal - len(precip_ens_nonzero) fraction_zero = float(nzero) / float(ntotal) return fraction_zero, precip_ens_nonzero, nz # ===================================================================== def define_knot_locations(nz, precip_samples_nonzero): # define_knot_locations: choose the number of knots and the indices # in the sorted precipitation samples according to the number of # positive precipitation amounts. We want fewer knots for small # samples, and we want the precipitation values of the chosen knots # to emphasize the upper quantiles of the distribution, as that's where # we care most about an accurate fit. # ---- these beta parameters be used to create a beta distribution # that will emphasize the upper quantiles of the distribution # rp = 3.5 rq = 1.0 # ---- the number of (interior) knots in the cubic spline will # be set to be no greater than 9.0, and for a sample size # of 100 will be 3. nknots = min([9,nz//30]) query_these_indices = [] cdf_at_indices = [] for iknot in range (1,nknots+1): rknot = float(iknot)/(nknots+1) xloc = stats.beta.ppf(rknot, rp, rq) c = stats.beta.cdf(xloc, rp, rq) iloc = int(nz*xloc) query_these_indices.append(iloc) c = (1./(2.*nz)) + float(iloc)/float(nz) cdf_at_indices.append(c) return nknots, query_these_indices, cdf_at_indices # ===================================================================== # ---- set parameters cmonth_list = ['Jan','Feb','Mar','Apr','May','Jun',\ 'Jul','Aug','Sep','Oct','Nov','Dec'] cmonthnum = ['01','02','03','03','04','05',\ '06','07','08','09','10','11','12'] print ('cmonth = ', cmonth) print (cmonth_list) imonth_index = cmonth_list.index(cmonth) jmin, jmax, imin, imax = set_domain_boundaries(cdomain) nstride = 1 cmonth_early, cmonth_late = get_surrounding_months(cmonth) pflag = False # for print statements master_directory = '/Volumes/NBM/'+cdomain+'_gefsv12/precip/netcdf/' master_directory_out = '/Volumes/NBM/'+cdomain+'_gefsv12/CDF_spline/' nmembers = 5 # ---- read in the previously generated netCDF file with precipitation # for this month and lead time + surrounding months. # All members, dates for this month have been # smushed into one leading index, dimension # nsamps, since the date of the forecast within the month and # the member number is irrelevant for the distribution fitting. ncfile = master_directory + cmonth + '_apcp_sfc_h' + clead + '.nc' print (ncfile) nc = Dataset(ncfile) precip_middle = nc.variables['apcp_fcst'][:,jmin:jmax,imin:imax] nsamps_middle, ny_gefsv12, nx_gefsv12 = np.shape(precip_middle) lons_1d = nc.variables['lons_fcst'][imin:imax] lats_1d = nc.variables['lats_fcst'][jmin:jmax] nc.close() ncfile = master_directory + cmonth_early + '_apcp_sfc_h' + clead + '.nc' print (ncfile) nc = Dataset(ncfile) precip_early = nc.variables['apcp_fcst'][:,jmin:jmax,imin:imax] nsamps_early, ny_gefsv12, nx_gefsv12 = np.shape(precip_early) nc.close() ncfile = master_directory + cmonth_late + '_apcp_sfc_h' + clead + '.nc' print (ncfile) nc = Dataset(ncfile) precip_late = nc.variables['apcp_fcst'][:,jmin:jmax,imin:imax] nsamps_late, ny_gefsv12, nx_gefsv12 = np.shape(precip_late) nc.close() # total samples of all members. nsamps = nsamps_middle + nsamps_early + nsamps_late nsamps_mean = nsamps//5 # ---- the first dimension of precip arrays are cases x members. # average over all the members precip_middle_mean = np.zeros((nsamps_middle//5, ny_gefsv12, nx_gefsv12), dtype=np.float64) for i in range(0, nsamps_middle,5): precip_middle_mean[i//5,:,:] = np.mean(precip_middle[i:i+5,:,:], axis=0) precip_early_mean = np.zeros((nsamps_early//5, ny_gefsv12, nx_gefsv12), dtype=np.float64) for i in range(0, nsamps_early,5): precip_early_mean[i//5,:,:] = np.mean(precip_early[i:i+5,:,:], axis=0) precip_late_mean = np.zeros((nsamps_late//5, ny_gefsv12, nx_gefsv12), dtype=np.float64) for i in range(0, nsamps_late,5): precip_late_mean[i//5,:,:] = np.mean(precip_late[i:i+5,:,:], axis=0) # ---- more initialization of output storage arrays now that # we know the array dimensions fzero = np.zeros((ny_gefsv12, nx_gefsv12), dtype=np.float) # ---- loop over the grid points and estimate the Gamma distributions # for each parameter. First see if a single Gamma distribution # is appropriate; if not, try a mixture of two. If that still # doesn't fit well, try a mixture of three. now = datetime.now() begin_time = now.strftime("%H:%M:%S") plotit = False spline_info_fcst = np.zeros((ny_gefsv12,nx_gefsv12,2,17), dtype=np.float64) usegamma_fcst = np.zeros((ny_gefsv12, nx_gefsv12), dtype=np.int32) q99 = np.zeros((ny_gefsv12,nx_gefsv12), dtype=np.float32) number_knots = np.zeros((ny_gefsv12,nx_gefsv12), dtype=np.int32) for jy in range(0,ny_gefsv12,nstride): #for jy in range(ny_gefsv12//2,ny_gefsv12//2+1): #for jy in range(528,529): now = datetime.now() current_time = now.strftime("%H:%M:%S") print ('***** begin time, current time, jy, ny_gefsv12, lat = ',\ begin_time, current_time, jy, ny_gefsv12, lats_1d[jy]) for ix in range(0,nx_gefsv12,nstride): #for ix in range (465,466): # ---- there is a grib round-off error that can give negative # values slightly smaller than teeny precip. to make sure # that we don't have either negative values or lots of the # same tiny values, so subtract teeny_precip precip_ens_1d_middle = precip_middle_mean[:,jy,ix] precip_ens_1d_early = precip_early_mean[:,jy,ix] precip_ens_1d_late = precip_late_mean[:,jy,ix] precip_ens_1d = np.concatenate((precip_ens_1d_middle, \ precip_ens_1d_early, precip_ens_1d_late)) tp = np.min( [np.abs(np.min(precip_ens_1d)), 0.0] ) teeny_precip = tp*np.ones(nsamps_mean) precip_ens_1d = precip_ens_1d - teeny_precip[:] fraction_zero, precip_ens_nonzero, nz = \ fraczero_possamps(nsamps_mean, precip_ens_1d) # return sorted fzero[jy,ix] = fraction_zero if pflag == True: print ('--------- jy, ix = ', jy,ix,' --------- ' ) if pflag == True: print (' precip_ens_nonzero[-1] = ', precip_ens_nonzero[-1]) if pflag == True: print (' nz, nsamps = ', nz, nsamps) if nz > 50: # ---- spline fit with the focus on knots at higher quantiles usegamma_fcst[jy,ix] = 0 q99[jy,ix] = precip_ens_nonzero[(99*nz)//100] empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz hazard_function_empirical = -np.log(1.0 - empirical_cdf) # ---- set the indices in the sorted precipitation sample # where spline knots are, the CDF at these values, # the number of knots, the precipitation amount at # chosen percentiles. nknots, query_these_indices, cdf_at_indices = \ define_knot_locations(nz, precip_ens_nonzero) empirical_precipvals = precip_ens_nonzero[query_these_indices] if pflag == True: print (' nknots = ', nknots) if pflag == True: print (' query_these_indices ', query_these_indices) if pflag == True: print (' cdf_at_indices = ', cdf_at_indices) spltemp_inv = splrep(precip_ens_nonzero, hazard_function_empirical, \ xb=0., task=-1, t = empirical_precipvals) lspline = len(spltemp_inv[0]) # final # knots including @ boundaries if pflag == True: print (' hazard_function_empirical = ',hazard_function_empirical) if pflag == True: print (' spltemp_inv = ', spltemp_inv) if pflag == True: print (' lspline = ', lspline) # --- set the number of knots and splines coefficients into arrays to # be dumped to netCDF file. spline_info_fcst[jy,ix,0,0:lspline] = spltemp_inv[0] spline_info_fcst[jy,ix,1,0:lspline] = spltemp_inv[1] number_knots[jy,ix] = lspline elif nz < 10: # --- over the whole training sample, there were not enough # nonzero precip values recorded. Flag this as missing data. usegamma_fcst[jy,ix] = -1 # flag as insufficient data to do any qmapping number_knots[jy,ix] = 0 spline_info_fcst[jy,ix,0,:] = -99.99 spline_info_fcst[jy,ix,1,:] = -99.99 q99[jy,ix] = -99.99 else: # ---- too few samples to use splines; fit a single-parameter Gamma # using Thom (1958) estimator described in Wilks textbook, # Statistical Methods in the Atmospheric Sciences. usegamma_fcst[jy,ix] = 1 # yes, estimate with Gamma distribution number_knots[jy,ix] = 0 empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz pmean = np.mean(precip_ens_nonzero) lnxbar = np.log(pmean) if pflag == True: print (' pmean, lnxbar = ', pmean, lnxbar) meanlnxi = np.mean(np.log(precip_ens_nonzero)) if pflag == True: print (' meanlnxi = ', meanlnxi) D = lnxbar - meanlnxi alpha_hat = (1.0 + np.sqrt(1.0+4.0*D/3.0)) / (4.0*D) beta_hat = pmean / alpha_hat if pflag == True: print(' alpha_hat, beta_hat = ', alpha_hat, beta_hat) spline_info_fcst[jy,ix,0,:] = alpha_hat # smoosh into the spline array spline_info_fcst[jy,ix,1,:] = beta_hat # smoosh into the spline array q99[jy,ix] = precip_ens_nonzero[-1] # --- evaluate Dn statistic, goodness of fit. y0 = precip_ens_nonzero / beta_hat fitted_CDF = stats.gamma.cdf(y0, alpha_hat) # ---- save spline information to netCDF file outfile = master_directory_out + cmonth_list[imonth_index]+'_'+cdomain+\ '_GEFSv12_mean_spline_info_h' + clead + '.nc' print ('writing to ',outfile) ncout = Dataset(outfile,'w',format='NETCDF4_CLASSIC') xf = ncout.createDimension('xf',nx_gefsv12) xvf = ncout.createVariable('xf','i4',('xf',)) xvf.long_name = "eastward grid point number on NDFD grid" xvf.units = "n/a" yf = ncout.createDimension('yf',ny_gefsv12) yvf = ncout.createVariable('yf','i4',('yf',)) yvf.long_name = "northward grid point number on NDFD grid" yvf.units = "n/a" xspd = ncout.createDimension('xspd',17) xspdf = ncout.createVariable('xspd','i4',('xspd',)) xspdf.long_name = "index for spline dimension" xspdf.units = "n/a" x2 = ncout.createDimension('x2',2) x2f = ncout.createVariable('x2','i4',('x2',)) x2f.long_name = "2nd index for spline dimension" x2f.units = "n/a" x9 = ncout.createDimension('x9',9) x9f = ncout.createVariable('x9','i4',('x9',)) x9f.long_name = "first dimension of nonzero_indices_of_knots" x9f.units = "n/a" lonsa = ncout.createVariable('lons','f4',('xf',)) lonsa.long_name = "longitude" lonsa.units = "degrees_east" latsa = ncout.createVariable('lats','f4',('yf',)) latsa.long_name = "latitude" latsa.units = "degrees_north" spline_info_out = ncout.createVariable('spline_info',\ 'f4',('yf','xf','x2','xspd'), zlib=True,least_significant_digit=6) spline_info_out.units = "n/a" spline_info_out.long_name = \ "Information for computing precipitation from"+\ "spline (or Gamma CDF for dry points)" spline_info_out.missing_value = \ np.array(-9999.99,dtype=np.float32) fzero_out = ncout.createVariable('fzero','f4',('yf','xf',), zlib=True,least_significant_digit=6) fzero_out.units = "n/a" fzero_out.long_name = "fraction_zero" fzero_out.missing_value = np.array(-9999.99,dtype=np.float32) usegamma_out = ncout.createVariable('usegamma',\ 'i4',('yf','xf',), zlib=True) usegamma_out.units = "n/a" usegamma_out.long_name = "1 if fit CDF via Gamma distribution, 0 if not" usegamma_out.missing_value = np.array(-99,dtype=np.int32) quantile_99 = ncout.createVariable('quantile_99',\ 'f8',('yf','xf',), zlib=True) quantile_99.units = "mm" quantile_99.long_name = "99th percentile of precipitation CDF" quantile_99.missing_value = np.array(-99.99,dtype=np.float32) numberknots_out = ncout.createVariable('number_knots',\ 'i4',('yf','xf',), zlib=True) numberknots_out.units = "n/a" numberknots_out.long_name = "number of knots when using spline" numberknots_out.missing_value = np.array(-99,dtype=np.int32) # ---- metadata ncout.title = "NDFD domain spline inverse coefficients / Gamma parameters " ncout.history = "from CDF fitting code by Tom Hamill, PSL" ncout.institution = "psl.noaa.gov" ncout.platform = "n/a" ncout.references = "n/a" # ---- copy the outputs to netCDF structures. xvf[:] = range(nx_gefsv12) yvf[:] = range(ny_gefsv12) xspdf[:] = range(17) x2f[:] = range(2) x9f[:] = range(9) lonsa[:] = lons_1d[:] latsa[:] = lats_1d[:] spline_info_out[:] = spline_info_fcst[:,:,:,:] fzero_out[:] = fzero[:,:] usegamma_out[:] = usegamma_fcst[:,:] quantile_99[:] = q99[:,:] numberknots_out[:] = number_knots[:,:] ncout.close()