""" CDF_fitting_mswep_precip_spline.py cmonth cend_hour this python script is designed to spline fit an empirical CDF of precipitation. The script is tailored to MSWEP precipitation analyses over one of the National Digital Forecast Database domains for the National Blend of Models Designed by Tom Hamill, NOAA, with help from Michael Scheuerer, Dec 2020 """ import os, sys from datetime import datetime import numpy as np import _pickle as cPickle from netCDF4 import Dataset import _pickle as cPickle import scipy.stats as stats from gammamix import gammamix_em import scipy.stats as stats from scipy.interpolate import LSQUnivariateSpline, splrep, splev # ===================================================================== def set_domain_boundaries(cdomain): """ used grib file of 2.5-km blend output grid to determine bounding lat and lon, and from that, the domain bounding indices for the 0.25 GEFSv12 reforecast data that will encompass the domain. """ if cdomain == 'conus': jmin = 93 jmax = 246 imin = 368 imax = 686 elif cdomain == 'pr': jmin = 243 jmax = 256 imin = 649 imax = 667 elif cdomain == 'ak': jmin = 19 jmax = 161 imin = 201 imax = 967 else: print ('invalid domain. Exiting.') sys.exit() return jmin, jmax, imin, imax # ===================================================================== def find_nearest(vec, value): """ given a vector vec and a particular value, find the index in vec that is nearest to value""" idx = np.abs(vec-value).argmin() return idx # ===================================================================== def fraczero_possamps(nsamps, precip_ens): """ from the vector input sample precip_ens, define the fraction of samples with zero precipitation. For the positive samples, add a small random number to deal with the fact that the data was discretized to ~0.1 mm, so that when later creating CDFs we don't have empirical values with lots of tied amounts. Also, sort the nonzero amounts and return. """ precip_ens_nonzero = np.delete(precip_ens, \ np.where(precip_ens <= 0.0)) # censor at 0.0 mm precip_ens_nonzero = precip_ens_nonzero + \ np.random.uniform(low=-0.01,high=0.01,size=len(precip_ens_nonzero)) precip_ens_nonzero = np.delete(precip_ens_nonzero, \ np.where(precip_ens_nonzero <= 0.0)) # censor at 0.0 mm nz = len(precip_ens_nonzero) #precip_ens_nonzero = precip_ens_nonzero + \ # np.random.uniform(low=-0.005,high=0.005,size=nz) precip_ens_nonzero = np.sort(precip_ens_nonzero) ntotal = len(precip_ens) nzero = ntotal - len(precip_ens_nonzero) fraction_zero = float(nzero) / float(ntotal) return fraction_zero, precip_ens_nonzero, nz # ===================================================================== # ---- inputs from command line cmonth = sys.argv[1] # '01', '02' etc. cend_hour = sys.argv[2] # 06, 12, 18, 00 -- end hour of 6-h period imonth = int(cmonth) - 1 nstride = 1 cdomain = 'conus' excessive_threshold = 0.035 # ---- set parameters pflag = False # for print statements master_directory = '/Volumes/Backup Plus/mswep/' ndaysomo = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] ndaysomo_leap = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] cmonths_early = ['12','01','02','03','04','05','06','07','08','09','10','11'] cmonths_late = ['02','03','04','05','06','07','08','09','10','11','12','01'] # ---- determine the overall number of daily precipitation # samples across all years for this month iearly = int(cmonths_early[imonth])-1 ilate = int(cmonths_late[imonth])-1 if imonth != 1: # not Feb nsamps_mid = ndaysomo[imonth]*20 else: nsamps_mid = 6*ndaysomo_leap[imonth] + 14*ndaysomo[imonth] if iearly != 1: # not Feb nsamps_early = ndaysomo[iearly]*20 else: nsamps_early = 6*ndaysomo_leap[iearly] + 14*ndaysomo[iearly] if ilate != 1: # not Feb nsamps_late = ndaysomo[ilate]*20 else: nsamps_late = 6*ndaysomo_leap[ilate] + 14*ndaysomo[ilate] nsamps = nsamps_mid + nsamps_early + nsamps_late print ('nsamps = ', nsamps) # ---- read in the previously generated netCDF file with precipitation # for this month and lead time as well as the surrounding # two months. All dates for this month have # been smushed into one leading index, dimension nsamps, # since the date of the forecast within the month and # the member number is irrelevant for the distribution # fitting. ktr = 0 for iyear in range(2000,2020): for cmo in [cmonth, cmonths_early[imonth], cmonths_late[imonth]]: imo = int(cmo)-1 if iyear%4 == 0: ndays = ndaysomo_leap[imo] else: ndays = ndaysomo[imo] cyear = str(iyear) infile = master_directory + cyear + cmo + \ '_on_ndfd_grid_6hourly.nc' print (iyear, infile, ndays) nc = Dataset(infile) yyyymmddhh_end = nc.variables['yyyymmddhh_end'][:] #print ('yyyymmddhh_end = ', yyyymmddhh_end) for iday in range(1,ndays+1): if iday < 10: cday = '0'+str(iday) else: cday = str(iday) iyyyymmddhh = int(str(iyear)+cmo+cday+cend_hour) #print (iyyyymmddhh) idx = np.where(yyyymmddhh_end == iyyyymmddhh)[0] precip_in = np.squeeze(nc.variables['apcp_anal'][idx,:,:]) if iyear == 2000 and iday == 1 and cmo == cmonth: nyin, nxin = np.shape(precip_in) precip_tseries = np.zeros((nsamps,nyin,nxin), \ dtype=np.float64) lons = nc.variables['lons'][:,:] lats = nc.variables['lats'][:,:] precip_tseries[ktr,:,:] = precip_in[:,:] ktr = ktr+1 nc.close() # ---- loop over the grid points and estimate the spline coefficients now = datetime.now() begin_time = now.strftime("%H:%M:%S") spline_info = np.zeros((nyin,nxin,2,17), dtype=np.float64) spline_info_inv = np.zeros((nyin,nxin,2,17), dtype=np.float64) indices_to_query = np.zeros((nyin,nxin,9), dtype=np.float16) Dnstat = 0.10*np.ones((nyin, nxin), dtype=np.float) fzero = np.zeros((nyin, nxin), dtype=np.float) cdf_at_indices = np.asarray([ 0.1, 0.25, 0.33333, 0.5, 0.65, 0.8, 0.85, 0.9, 0.95]) for jy in range(0,nyin,nstride): #for jy in range(0, 1): now = datetime.now() current_time = now.strftime("%H:%M:%S") print ('***** begin time, current time, jy, nyin, lat = ',\ begin_time, current_time, jy, nyin, lats[jy,0]) for ix in range(0,nxin,nstride): #for ix in range (492,493): # ---- there is a grib round-off error that can give negative # values slightly smaller than teeny precip. to make sure # that we don't have either negative values or lots of the # same tiny values, subtractoff teeny_precip precip_ens_1d = precip_tseries[:,jy,ix] tp = np.min( [np.abs(np.min(precip_ens_1d)), 0.0] ) teeny_precip = tp*np.ones(nsamps) precip_ens_1d = precip_ens_1d - teeny_precip[:] fraction_zero, precip_ens_nonzero, nz = \ fraczero_possamps(nsamps, precip_ens_1d) # return sorted if pflag == True: print (' precip_ens_nonzero[0:-1] = ',\ precip_ens_nonzero[0:-1]) fzero[jy,ix] = fraction_zero empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz if nz > 40: # ---- spline fit the CDF to the precipitation values via # Michael Scheuerer's hazard function (see Fig 3 in # https://doi.org/10.1175/MWR-D-20-0096.1. ) query_these_indices = [ nz//10, nz//4, nz//3, nz//2, (3*nz)//5, \ (4*nz)//5, (17*nz)//20, (9*nz)//10, (19*nz)//20] indices_to_query[jy,ix,:] = query_these_indices[:] empirical_precipvals = precip_ens_nonzero[query_these_indices] hazard_function_empirical = -np.log(1.0-empirical_cdf) spltemp = splrep(precip_ens_nonzero, hazard_function_empirical, \ xb=0., task=-1, t = empirical_precipvals) spline_hazard = splev(precip_ens_nonzero, spltemp) spline_cdf = 1.0 - np.exp(-spline_hazard) # ---- save spline information to numpy array, spline_info[jy,ix,0,:] = spltemp[0] spline_info[jy,ix,1,:] = spltemp[1] # ---- in the subsequent quantile mapping, we will want the # analyzed precipitation amount given the quantile. # Accordingly, let's also reverse the data in the spline # and get the spline fits of precipitation amount to the cdf. hazard_function_at_indices = -np.log(1.0-cdf_at_indices) spltemp_inv = splrep(hazard_function_empirical, precip_ens_nonzero, \ xb=0., task=-1, t = cdf_at_indices) spline_info_inv[jy,ix,0,:] = spltemp_inv[0] spline_info_inv[jy,ix,1,:] = spltemp_inv[1] # --- evaluate Dn statistic, goodness of fit. diff = np.abs(empirical_cdf - spline_cdf) Dnstat[jy,ix] = np.max(diff) else: # ---- too few samples; fit a single-parameter Gamma using Thom # estimator described in Wilks textbook, Statistical # Methods in the Atmospheric Sciences. pmean = np.mean(precip_ens_nonzero) lnxbar = np.log(pmean) meanlnxi = np.mean(np.log(precip_ens_nonzero)) D = lnxbar - meanlnxi alpha_hat = (1.0 + np.sqrt(1.0+4.0*D/3.0)) / (4.0*D) beta_hat = pmean / alpha_hat indices_to_query[jy,ix,:] = -1 # flag for using Gamma spline_info[jy,ix,0,:] = alpha_hat # smoosh into the spline array spline_info[jy,ix,1,:] = beta_hat # smoosh into the spline array # --- evaluate Dn statistic, goodness of fit. y0 = precip_ens_nonzero / beta_hat fitted_CDF = stats.gamma.cdf(y0, alpha_hat) diff = np.abs(empirical_cdf - fitted_CDF) Dnstat[jy,ix] = np.max(diff) # --- save to cPickle file outfile = master_directory + cmonth+'_'+cdomain+\ '_MSWEP_spline_info_h' + cend_hour + '.cPick' print ('writing to ', outfile) ouf = open(outfile, 'wb') cPickle.dump(spline_info, ouf) cPickle.dump(spline_info_inv, ouf) cPickle.dump(fzero, ouf) cPickle.dump(indices_to_query, ouf) ouf.close() outfile = master_directory + cmonth+'_'+cdomain+\ '_MSWEP_Dnstat_h' + cend_hour + '.cPick' print ('writing to ', outfile) ouf = open(outfile, 'wb') cPickle.dump(Dnstat, ouf) cPickle.dump(lons, ouf) cPickle.dump(lats, ouf) ouf.close()