Source code for salishsea_tools.loadDataFRP

# Copyright 2013 – present by the SalishSeaCast contributors
# and The University of British Columbia
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Load CTD cast data from Fraser River plume region that Elise Olson collected to ground-truth
her turbidity model."""

import os
import re
import string

import gsw
import netCDF4 as nc
import numpy as np
import pandas as pd
from scipy import signal as ssig

from salishsea_tools import geo_tools

# list CTD cnv files associated with cast numbers
cnvlist19 = {
    1: "fraser2017101.cnv",
    2: "fraser2017102.cnv",
    3: "fraser2017103.cnv",
    4: "fraser2017104.cnv",
    5: "fraser2017105.cnv",
    6: "fraser2017106.cnv",
    7: "fraser2017107.cnv",
    8: "fraser2017108.cnv",
    9: "fraser2017109.cnv",
    10: "fraser2017110.cnv",
    11: "fraser2017111.cnv",
    12: "fraser2017112.cnv",
    13: "fraser2017113.cnv",
    14.1: "fraser2017114.cnv",
    14.2: "fraser2017114.cnv",
    15: "fraser2017115.cnv",
    16: "fraser2017116.cnv",
    17: "fraser2017117.cnv",
    18: "fraser2017118.cnv",
    19: "fraser2017119.cnv",
    20: "fraser2017120.cnv",
    21: "fraser2017121.cnv",
    22: "fraser2017122.cnv",
    23: "fraser2017123.cnv",
    24: "fraser2017124.cnv",
}

cnvlist25 = {
    1: "fraser2017001.cnv",
    2: "fraser2017002.cnv",
    3: "fraser2017003.cnv",
    4: "fraser2017004.cnv",
    5: "fraser2017005.cnv",
    6: "fraser2017006.cnv",
    7: "fraser2017007.cnv",
    8: "fraser2017008.cnv",
    9: "fraser2017009.cnv",
    10: "fraser2017010.cnv",
    11: "fraser2017011.cnv",
    12: "fraser2017012.cnv",
    13: "fraser2017013.cnv",
    14.1: "fraser2017014.cnv",
    14.2: "fraser2017014.cnv",
    15: "fraser2017015.cnv",
    16: "fraser2017016.cnv",
    17: "fraser2017017.cnv",
    18: "fraser2017018.cnv",
    19: "fraser2017019.cnv",
    20: "fraser2017020.cnv",
    21: "fraser2017021.cnv",
    22: "fraser2017022.cnv",
    23: "fraser2017023.cnv",
    24: "fraser2017024.cnv",
}


class Cast:
    def __init__(self, fpath):
        mSta, mLat, mLon, df = readcnv(fpath)
        self.sta = mSta
        self.lat = mLat
        self.lon = mLon
        self.df = df
        self.source = fpath


class zCast:
    def __init__(self, updf, downdf):
        self.uCast = updf
        self.dCast = downdf


class rawCast:
    def __init__(self):
        self.uCast = dict()
        self.dCast = dict()


class dataPair:
    def __init__(self, zval, varval):
        self.z = zval
        self.val = varval



[docs]
def fmtVarName(strx):
    """transform string into one that meets python naming conventions"""
    vName = re.sub(r"[^a-zA-Z0-9_\-\s/]", "", strx.strip())
    vName = re.sub(r"[\s/]", "_", vName)
    vName = re.sub("-", "_", vName)
    if re.match("[0-9]", vName):
        vName = "_" + vName
    return vName



# def rolling_window(a, window):
#    # source: https://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
#    # use example: np.mean(rolling_window(x, 3), -1)
#    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
#    strides = a.strides + (a.strides[-1],)
#    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
#
# def rolling_window_padded(a,window):
#    # extend rolling window to be same lenth as input array by duplicating first and last values
#    # even values not symmetric
#    test=rolling_window(a,window)
#    while window>1:
#        if window%2==0:
#            test=np.concatenate(([test[0,:]],test),axis=0)
#        else:
#            test=np.concatenate((test,[test[-1,:]]),axis=0)
#        window+=-1
#    return test


def slidingWindowEval(x, func, window, axis=0):
    # x is input array
    # func is function to carry out over window
    # window is window size
    # axis is axis to act along, in case of multiple
    # if window is even, results will be shifted left by 1/2 unit
    x1 = np.lib.stride_tricks.sliding_window_view(x, window, axis)
    b = func(x1, -1)

    # the rest of the code pads the front and back to return an array of the same shape as the original
    nfront = np.floor((window - 1) / 2)
    nback = np.floor((window - 1) / 2) + (window - 1) % 2
    inxf = [slice(None)] * np.ndim(b)
    inxf[axis] = slice(0, 1, 1)
    inxb = [slice(None)] * np.ndim(b)
    inxb[axis] = slice(np.shape(b)[axis] - 1, np.shape(b)[axis], 1)
    repsf = np.ones(np.ndim(b), dtype=int)
    repsf[axis] = int(nfront)
    repsb = np.ones(np.ndim(b), dtype=int)
    repsb[axis] = int(nback)
    x2 = np.concatenate(
        (np.tile(b[tuple(inxf)], repsf), b, np.tile(b[tuple(inxb)], repsb)), axis=axis
    )
    return x2


def amp(var, dim=0):
    return np.nanmax(var, dim) - np.nanmin(var, dim)


def turbQC(x):
    # turbidity sensor produced erroneous zero readings interspersed with real data when too close to surface
    # remove suspect values from analysis
    # - median filter alone was not enough

    # remove a point if the max-min of the surrounding 5 point window
    # is greater than 1/3 the maximum turbidity value of the cast
    # (remove data within 5 points of a large jump)
    # ii1=amp(rolling_window_padded(x,5),-1)>.33*np.nanmax(x)
    ii1 = slidingWindowEval(x, amp, 5) > 0.33 * np.nanmax(x)  # was .5
    # remove data within 5 points of a near-zero turbidity value
    # ii2=np.nanmin(rolling_window_padded(x,5),-1)<.3
    ii2 = slidingWindowEval(x, np.nanmin, 5) < 0.3
    y = np.copy(x)
    y[
        np.logical_or(
            ii1,
            ii2,
        )
    ] = np.nan
    y = ssig.medfilt(y, 3)
    return y


def readcnv(fpath):
    alphnumlist = list(string.ascii_letters) + list(string.digits)
    # define regexes for reading headers:
    reSta = re.compile(
        r"(?<=\*\*\sStation:)\s?([0-9])+\s?"
    )  # assumes numeric station identifiers
    reLat = re.compile(r"(?<=\*\*\sLatitude\s=)\s?([\-0-9\.]+)\s([\-\.0-9]+)\s?([NS])")
    reLon = re.compile(r"(?<=\*\*\sLongitude\s=)\s?([\-0-9\.]+)\s([\-\.0-9]+)\s?([EW])")
    # start_time = May 08 2002 09:39:10
    reST = re.compile(r"(?<=\#\sstart_time\s=).*")
    # reTZ=re.compile('(?<=\*\*\s...\s\(Time\)\s=).*')
    # reCr=re.compile('(?<=\*\*\sCruise:).*')
    reNam = re.compile(r"(?<=\#\sname\s)([0-9]+)\s=\s(.*)\:\s?(.*)\s?")

    # define regex for finding searching:
    spStart = re.compile(r"^\s*[0-9]")  # starts with space characters followed by digit

    headers = list()
    # lineno=0
    mSta = None
    mLat = None
    mLon = None
    with open(fpath, "rt", encoding="ISO-8859-1") as f:
        for fline in f:
            if fline.startswith("**"):
                if reSta.search(fline):
                    mSta = reSta.search(fline).groups()
                if reLat.search(fline):
                    mLat = reLat.search(fline).groups()
                if reLon.search(fline):
                    mLon = reLon.search(fline).groups()
            elif reNam.search(fline):
                headers.append(fmtVarName(reNam.search(fline).groups(1)[1]))
            elif fline.startswith("*END*"):
                break
            # lineno+=1
        # still in with file open
        df = pd.read_csv(f, delim_whitespace=True, names=headers)
    # file closed

    return mSta, mLat, mLon, df


def bindepth(inP, inV, edges, targets=[], prebin=False):
    # calculate depth-associated variables
    # 1st calculate bin averages of depth and variable
    # then use np interp to estimate at-grid-point values
    # edges must be monotonically increasing
    if prebin == True:
        newP, newV = bindepth(
            inP, inV, np.arange(edges[0], edges[-1], 0.05), prebin=False
        )
        inP = newP
        inV = newV
    inP = inP[~np.isnan(inV)]
    inV = inV[~np.isnan(inV)]
    binned = np.digitize(inP, edges)
    Pa = np.empty(len(edges) - 1)
    Va = np.empty(len(edges) - 1)
    if len(targets) == 0:
        Pi = 0.5 * (edges[:-1] + edges[1:])
    else:
        Pi = targets[: (len(edges) - 1)]
    Vi = np.empty(len(edges) - 1)
    for jj in range(1, len(edges)):
        ll = binned == jj  # &(~np.isnan(inV))
        if np.sum(ll) > 0:
            Pa[jj - 1] = np.mean(inP[ll])
            Va[jj - 1] = np.mean(inV[ll])
        else:
            Pa[jj - 1] = np.nan
            Va[jj - 1] = np.nan
    # linearly extrapolate some values, but not beyond range of original data
    pnew = Pa[0] - (Pa[1] - Pa[0])
    vnew = Va[0] - (Va[1] - Va[0])
    Pa = np.concatenate(([pnew], Pa))
    Va = np.concatenate(([vnew], Va))
    Vi = np.interp(Pi, Pa[~np.isnan(Va)], Va[~np.isnan(Va)], right=np.nan, left=np.nan)
    Vi[Pi > np.max(inP)] = np.nan
    Vi[Pi < np.min(inP)] = np.nan
    return Pi, Vi


def cXfromX(X):
    X = np.array(X)
    X[np.isnan(X)] = -5
    Y = np.nan * X
    iii = (X > 0) & (X < 100)
    Y[iii] = -np.log(X[iii] / 100.0) / 0.25
    return Y


def turbReg(m, Cx, fl):
    return np.maximum(0.0, m[0] * Cx - m[1] * fl - m[2])


def turbFit(df0):
    # calculate conversion factor for sb19 ctd turbidity to ALS bottle turbidity
    # force through (0,0)
    x = df0.loc[(df0.ALS_Turb_NTU > 0) & (df0.sb19Turb_uncorrected > 0)][
        "sb19Turb_uncorrected"
    ].values
    x = x[:, np.newaxis]
    y = df0.loc[(df0.ALS_Turb_NTU > 0) & (df0.sb19Turb_uncorrected > 0)]["ALS_Turb_NTU"]
    tinv = np.linalg.lstsq(x, y, rcond=None)[0]
    tcor = 1.0 / tinv
    return tcor


def loadDataFRP_init(exp="all"):
    if exp not in {"exp1", "exp2", "exp3", "all"}:
        print("option exp=" + exp + " is not defined.")
        raise
    with open(
        "/ocean/shared/SalishSeaCastData/FRPlume/stationsDigitizedFinal.csv", "r"
    ) as fa:
        df0_a = pd.read_csv(fa, header=0, na_values="None")
    with open(
        "/ocean/shared/SalishSeaCastData/FRPlume/util/stationsDigitized_ancillary.csv",
        "r",
    ) as fb:
        df0_b = pd.read_csv(fb, header=0, na_values="None")
    df0 = pd.merge(df0_a, df0_b, how="left", on=["Station", "Date"])

    # if values present, calculate correction factor for sb19 turbidity (divide sb19 turbidity by tcor)
    # fit true turb to observed turb
    # calculate here while all values present
    if np.sum(df0.sb19Turb_uncorrected > 0) > 0:
        tcor = turbFit(df0)
    else:
        tcor = np.nan
    if exp == "exp1":
        df0 = df0.drop(df0.index[df0.Date != 20170410])
    elif exp == "exp2":
        df0 = df0.drop(df0.index[df0.Date != 20170531])
    elif exp == "exp3":
        df0 = df0.drop(df0.index[df0.Date != 20171101])

    basedir1 = "/ocean/shared/SalishSeaCastData/FRPlume/ctd/20170410/"
    basedir2 = "/ocean/shared/SalishSeaCastData/FRPlume/ctd/20170531/"
    basedir3 = "/ocean/shared/SalishSeaCastData/FRPlume/ctd/20171101/"
    dir19 = "19-4561/4_derive"
    dir25 = "25-0363/4_derive"
    dir19T10 = "19-4561/4b_deriveTEOS10"
    dir25T10 = "25-0363/4a_deriveTEOS10"

    clist = []
    if exp == "exp1" or exp == "all":
        clist = clist + list(range(1, 10))
    if exp == "exp2" or exp == "all":
        clist = clist + [10, 11, 12, 13, 14.1, 14.2, 15, 16, 17, 18]
    if exp == "exp3" or exp == "all":
        clist = clist + list(range(19, 25))

    fpath19 = dict()
    fpath25 = dict()
    for ii in clist:
        if ii < 10:
            fpath19[ii] = os.path.join(basedir1, dir19T10, cnvlist19[ii])
            fpath25[ii] = os.path.join(basedir1, dir25T10, cnvlist25[ii])
        elif ii < 19:
            fpath19[ii] = os.path.join(basedir2, dir19T10, cnvlist19[ii])
            fpath25[ii] = os.path.join(basedir2, dir25T10, cnvlist25[ii])
        else:
            fpath19[ii] = os.path.join(basedir3, dir19T10, cnvlist19[ii])
            fpath25[ii] = os.path.join(basedir3, dir25T10, cnvlist25[ii])

    cast19 = dict()
    cast25 = dict()
    for ii in clist:
        cast19[ii] = Cast(fpath19[ii])
        cast25[ii] = Cast(fpath25[ii])
    return df0, clist, tcor, cast19, cast25


def loadDataFRP(
    exp="all",
    sel="narrow",
    dp=1.0,
    form="binned",
    vert="P",
    meshPath="/data/eolson/results/MEOPAR/NEMO-forcing-new/grid/mesh_mask201702.nc",
):
    # exp determines which sampling date to load (or all)
    # sel determines whether to use narrow data selection, which is a more conservative estimate,
    #       or 'wide' data selection, which includes more near-surface data but can include
    #       some time the intstruments were parked near-surface
    # dp determines bin interval in terms of depth variable vert (only if form='binned')
    # form can be 'binned','raw' or 'SSCgrid' and determines if binned and how
    # vert determines vertical variable: 'P' or 'Z'
    # meshPath is SSC mesh file location for binning to model grid (form='SSCgrid')
    if exp not in {"exp1", "exp2", "exp3", "all"}:
        print("option exp=" + exp + " is not defined.")
        raise
    if sel == "narrow":
        prebin = False
    elif sel == "wide":
        prebin = True
    else:
        print("option sel=" + sel + " is not defined.")
        raise
    df0, clist, tcor, cast19, cast25 = loadDataFRP_init(exp=exp)
    parDZ = 0.78
    xmisDZ = 0.36
    turbDZ = 0.67
    pshiftdict = {
        "gsw_ctA0": 0.0,
        "gsw_srA0": 0.0,
        "xmiss": xmisDZ,
        "seaTurbMtr": turbDZ,
        "par": parDZ,
        "wetStar": 0.0,
        "sbeox0ML_L": 0.0,
        "seaTurbMtrnoQC": turbDZ,
        "turb_uncor": turbDZ,
        "turb": turbDZ,
    }
    # for SSC grid version, load model grid variables
    if form == "SSCgrid":
        with nc.Dataset(meshPath, "r") as mesh:
            tmask = mesh.variables["tmask"][0, :, :, :]
            gdept = mesh.variables["gdept_0"][0, :, :, :]
            gdepw = mesh.variables["gdepw_0"][0, :, :, :]
            nav_lat = mesh.variables["nav_lat"][:, :]
            nav_lon = mesh.variables["nav_lon"][:, :]

    zCasts = dict()
    for nn in clist:
        ip = np.argmax(cast25[nn].df["prSM"].values)
        ilag = df0.loc[df0.Station == nn, "ishift_sub19"].values[0]
        pS_pr = df0.loc[df0.Station == nn, "pS_pr"].values[0]
        pE_pr = df0.loc[df0.Station == nn, "pE_pr"].values[0]
        pS_tur = df0.loc[df0.Station == nn, "pStart25"].values[0]
        pE_tur = df0.loc[df0.Station == nn, "pEnd25"].values[0]
        lat = df0.loc[df0.Station == nn]["LatDecDeg"].values[0]
        lon = df0.loc[df0.Station == nn]["LonDecDeg"].values[0]
        if sel == "narrow":
            pS = pS_tur
            pE = pE_tur
        elif sel == "wide":
            pS = pS_pr
            pE = pE_pr

        cast19[nn].df["seaTurbMtrnoQC"] = cast19[nn].df["seaTurbMtr"]
        cast19[nn].df["turb_uncor"] = turbQC(cast19[nn].df["seaTurbMtr"])
        cast19[nn].df["turb"] = cast19[nn].df["turb_uncor"] * 1.0 / tcor

        if vert == "Z":
            # calc Z from p
            cast25[nn].df["Z"] = -1 * gsw.z_from_p(cast25[nn].df["prSM"].values, lat)
            cast19[nn].df["Z"] = -1 * gsw.z_from_p(cast19[nn].df["prdM"].values, lat)
            zvar25 = "Z"
        else:
            zvar25 = "prSM"
        pmax = cast25[nn].df.loc[ip, zvar25]
        if form == "binned" or form == "raw":
            edges = np.arange(dp / 2, pmax + dp, dp)
            targets = []  # use default value of 1/2-way between edges
        elif form == "SSCgrid":
            jj, ii = geo_tools.find_closest_model_point(lon, lat, nav_lon, nav_lat)
            edges = gdepw[:, jj, ii]  # model w grid
            targets = gdept[:, jj, ii]  # model T grid
            edges = edges[edges < pmax]
            targets = targets[: (len(edges) - 1)]

        if form == "binned" or form == "SSCgrid":
            dCast = pd.DataFrame()
            uCast = pd.DataFrame()
        elif form == "raw":
            zCasts[nn] = rawCast()
            dCast = zCasts[nn].dCast
            uCast = zCasts[nn].uCast

        def makeVCol(incast_p, outcast, var, i0, i1, incast_v=None):  # add in zvar25
            # incast_p: [cast25[nn]] sb25 cast to get pressure from; also used for var unless incast_v specified
            # outcast: [dCast or uCast] cast to save resulting aligned data to
            # i0, i1: [pS:ip/ip:pE] starting and ending indices for data to include relative to incast_p
            # var: variable name to extract
            # incast_v [cast19[nn]] if variables are from sbe19, specify that cast; otherwise get variable from incast_p (25)
            # vplag: ilag to align sbe19 to sbe25 in case where var is from sbe19; defaults to 0 for sbe25
            # NOTE: dataframes are mutable; changes to outcast propagate to its input df
            if incast_v is None:
                incast_v = incast_p
                vplag = 0
            else:
                vplag = ilag
            inP = incast_p.df.loc[i0:i1][zvar25].values - pshiftdict[var]  # p
            inV = incast_v.df.loc[(i0 + vplag) : (i1 + vplag)][var].values  # var
            if sel == "wide":
                inV[inP < 0.1] = np.nan
            if form == "binned" or form == "SSCgrid":
                p, out = bindepth(inP, inV, edges, targets=targets, prebin=prebin)
                if len(outcast) == 0:  # depth var not added yet
                    outcast[zvar25] = p
                    if form == "SSCgrid":
                        outcast["indk"] = np.arange(0, len(p))
                outcast[var] = out
            elif form == "raw":
                outcast[var] = dataPair(inP, inV)
            return

        # fix first 2 casts for which sb25 pump did not turn on. use sb19
        if nn == 1 or nn == 2:
            ##xmiss: xmis25=1.14099414691*xmis19+-1.6910134322
            cast19[nn].df["xmiss"] = (
                1.14099414691 * cast19[nn].df["CStarTr0"] - 1.6910134322
            )  # down var
            for var in ("gsw_ctA0", "gsw_srA0", "xmiss"):
                # downcast
                makeVCol(cast25[nn], dCast, var, pS, ip, incast_v=cast19[nn])
                # upcast
                makeVCol(cast25[nn], uCast, var, ip, pE, incast_v=cast19[nn])
            makeVCol(cast25[nn], dCast, "par", pS, ip)
            makeVCol(cast25[nn], uCast, "par", ip, pE)
            uCast["wetStar"] = np.nan
            dCast["wetStar"] = np.nan
            uCast["sbeox0ML_L"] = np.nan
            dCast["sbeox0ML_L"] = np.nan
        else:
            for var in (
                "gsw_ctA0",
                "gsw_srA0",
                "xmiss",
                "par",
                "wetStar",
                "sbeox0ML_L",
            ):
                if not nn == 14.2:
                    # downcast
                    makeVCol(cast25[nn], dCast, var, pS, ip)
                else:  # special case where there is no downcast
                    if len(dCast) == 0:  # depth var not added yet
                        dCast = pd.DataFrame(np.nan * np.ones(10), columns=[zvar25])
                    dCast[var] = np.nan * np.ones(10)
                if not nn == 14.1:
                    # upcast
                    makeVCol(cast25[nn], uCast, var, ip, pE)
                else:  # special case where there is no upcast
                    if len(uCast) == 0:  # depth var not added yet
                        uCast = pd.DataFrame(np.nan * np.ones(10), columns=[zvar25])
                    uCast[var] = np.nan * np.ones(10)
        for var in ("seaTurbMtrnoQC", "turb_uncor", "turb"):
            if not nn == 14.2:
                # turbidity downcast
                makeVCol(cast25[nn], dCast, var, pS, ip, incast_v=cast19[nn])
            else:  # special case where there is no downcast
                dCast[var] = np.nan * np.ones(10)
            if not nn == 14.1:
                # turbidity upcast
                makeVCol(cast25[nn], uCast, var, ip, pE, incast_v=cast19[nn])
            else:  # special case where there is no upcast
                uCast[var] = np.nan * np.ones(10)

        if form == "binned" or form == "SSCgrid":
            zCasts[nn] = zCast(uCast, dCast)

    return df0, zCasts


def loadDataFRP_raw(exp="all", sel="narrow"):
    df0, zCasts = loadDataFRP(exp=exp, sel=sel, vert="Z", form="raw")
    return df0, zCasts


def loadDataFRP_SSGrid(
    exp="all",
    sel="narrow",
    meshPath="/ocean/eolson/MEOPAR/NEMO-forcing/grid/mesh_mask201702.nc",
):
    df0, zCasts = loadDataFRP(
        exp=exp, sel=sel, vert="Z", form="SSCgrid", meshPath=meshPath
    )
    for ic in zCasts.values():
        ic.dCast["depth_m"] = ic.dCast["Z"]
        ic.uCast["depth_m"] = ic.uCast["Z"]
    return df0, zCasts