Source code for mpas_analysis.shared.climatology.climatology

# This software is open source software available under the BSD-3 license.
#
# Copyright (c) 2022 Triad National Security, LLC. All rights reserved.
# Copyright (c) 2022 Lawrence Livermore National Security, LLC. All rights
# reserved.
# Copyright (c) 2022 UT-Battelle, LLC. All rights reserved.
#
# Additional copyright and license information can be found in the LICENSE file
# distributed with this code, or at
# https://raw.githubusercontent.com/MPAS-Dev/MPAS-Analysis/main/LICENSE
"""
Functions for creating climatologies from monthly time series data
"""
# Authors
# -------
# Xylar Asay-Davis

import xarray as xr
import os
import numpy
from tempfile import TemporaryDirectory

from mpas_tools.io import write_netcdf
from pyremap import Remapper, LatLonGridDescriptor, ProjectionGridDescriptor

from mpas_analysis.shared.constants import constants

from mpas_analysis.shared.timekeeping.utility import days_to_datetime

from mpas_analysis.shared.io.utility import build_config_full_path, \
    make_directories, fingerprint_generator
from mpas_analysis.shared.io import write_netcdf_with_fill

from mpas_analysis.shared.climatology.comparison_descriptors import \
    get_comparison_descriptor, known_comparison_grids



[docs]
def get_remapper(config, sourceDescriptor, comparisonDescriptor,
                 mappingFilePrefix, method, logger=None, vertices=False):
    """
    Given config options and descriptions of the source and comparison grids,
    returns a ``pyremap.Remapper`` object that can be used to remap from source
    files or data sets to corresponding data sets on the comparison grid.

    If necessary, creates the mapping file containing weights and indices
    needed to perform remapping.

    Parameters
    ----------
    config : tranche.Tranche
        Contains configuration options

    sourceDescriptor : pyremap.MeshDescriptor
        A description of the source mesh or grid

    comparisonDescriptor : pyremap.MeshDescriptor
        A description of the comparison grid

    mappingFilePrefix : str
        A prefix to be prepended to the mapping file name

    method : {'bilinear', 'neareststod', 'conserve'}
        The method of interpolation used.

    logger : logging.Logger, optional
        A logger to which ncclimo output should be redirected

    vertices : bool, optional
        Whether to remap from vertices, rather than cells

    Returns
    -------
    remapper : pyremap.Remapper
        A remapper that can be used to remap files or data sets from the source
        grid or mesh to the comparison grid.
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    mappingFileName = None

    if not _matches_comparison(sourceDescriptor, comparisonDescriptor):
        # we need to remap because the grids don't match

        if vertices:
            srcMeshName = f'{sourceDescriptor.mesh_name}_vertices'
        else:
            srcMeshName = sourceDescriptor.mesh_name
        destMeshName = comparisonDescriptor.mesh_name

        mappingBaseName = \
            f'{mappingFilePrefix}_{srcMeshName}_to_{destMeshName}_{method}.nc'

        tryCustom = config.get('diagnostics', 'customDirectory') != 'none'
        if tryCustom:
            # first see if mapping files are in the custom directory
            mappingSubdirectory = build_config_full_path(
                config, 'diagnostics', 'mappingSubdirectory',
                baseDirectoryOption='customDirectory')

            mappingFileName = f'{mappingSubdirectory}/{mappingBaseName}'

        if not tryCustom or not os.path.exists(mappingFileName):
            # second see if mapping files are in the base directory

            mappingSubdirectory = build_config_full_path(
                config, 'diagnostics', 'mappingSubdirectory',
                baseDirectoryOption='base_path')

            mappingFileName = f'{mappingSubdirectory}/{mappingBaseName}'

        if not os.path.exists(mappingFileName):
            # we don't have a mapping file yet, so get ready to create one
            # in the output subfolder if needed
            mappingSubdirectory = \
                build_config_full_path(config, 'output',
                                       'mappingSubdirectory')
            make_directories(mappingSubdirectory)
            mappingFileName = f'{mappingSubdirectory}/{mappingBaseName}'

    mpiTasks = config.getint('execute', 'mapMpiTasks')
    esmf_parallel_exec = config.get('execute', 'mapParallelExec')
    if esmf_parallel_exec == 'None':
        esmf_parallel_exec = None

    remapper = Remapper(
        ntasks=mpiTasks,
        map_filename=mappingFileName,
        method=method,
        parallel_exec=esmf_parallel_exec,
        src_descriptor=sourceDescriptor,
        dst_descriptor=comparisonDescriptor,
    )

    if mappingFileName is not None and not os.path.exists(mappingFileName):
        mappingSubdirectory = \
            build_config_full_path(
                config, 'output', 'mappingSubdirectory')
        make_directories(mappingSubdirectory)
        with TemporaryDirectory(dir=mappingSubdirectory) as tempdir:
            remapper.src_scrip_filename = os.path.join(
                tempdir, remapper.src_scrip_filename)
            remapper.dst_scrip_filename = os.path.join(
                tempdir, remapper.dst_scrip_filename)

            # TEMP: logger not supported in this RC
            remapper.build_map(logger=logger)

    return remapper




[docs]
def compute_monthly_climatology(ds, calendar=None, maskVaries=True):
    """
    Compute monthly climatologies from a data set.  The mean is weighted but
    the number of days in each month of the data set, ignoring values masked
    out with NaNs.  If the month coordinate is not present, a data array
    ``month`` will be added based on ``Time`` and the provided calendar.

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        A data set with a ``Time`` coordinate expressed as days since
        0001-01-01 or ``month`` coordinate

    calendar : {'gregorian', 'noleap'}, optional
        The name of one of the calendars supported by MPAS cores, used to
        determine ``month`` from ``Time`` coordinate, so must be supplied if
        ``ds`` does not already have a ``month`` coordinate or data array

    maskVaries : bool, optional
        If the mask (where variables in ``ds`` are ``NaN``) varies with time.
        If not, the weighted average does not need make extra effort to account
        for the mask.  Most MPAS fields will have masks that don't vary in
        time, whereas observations may sometimes be present only at some
        times and not at others, requiring ``maskVaries = True``.

    Returns
    -------
    climatology : object of same type as ``ds``
        A data set without the ``'Time'`` coordinate containing the mean
        of ds over all months in monthValues, weighted by the number of days
        in each month.
    """

    # Authors
    # -------
    # Xylar Asay-Davis

    def compute_one_month_climatology(ds):
        monthValues = list(ds.month.values)
        return compute_climatology(ds, monthValues, calendar, maskVaries)

    ds = add_years_months_days_in_month(ds, calendar)

    monthlyClimatology = \
        ds.groupby('month').map(compute_one_month_climatology)

    return monthlyClimatology




[docs]
def compute_climatology(ds, monthValues, calendar=None,
                        maskVaries=True):
    """
    Compute a monthly, seasonal or annual climatology data set from a data
    set.  The mean is weighted but the number of days in each month of
    the data set, ignoring values masked out with NaNs.  If the month
    coordinate is not present, a data array ``month`` will be added based
    on ``Time`` and the provided calendar.

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        A data set with a ``Time`` coordinate expressed as days since
        0001-01-01 or ``month`` coordinate

    monthValues : int or array-like of ints
        A single month or an array of months to be averaged together

    calendar : {'gregorian', 'noleap'}, optional
        The name of one of the calendars supported by MPAS cores, used to
        determine ``month`` from ``Time`` coordinate, so must be supplied if
        ``ds`` does not already have a ``month`` coordinate or data array

    maskVaries : bool, optional
        If the mask (where variables in ``ds`` are ``NaN``) varies with time.
        If not, the weighted average does not need make extra effort to account
        for the mask.  Most MPAS fields will have masks that don't vary in
        time, whereas observations may sometimes be present only at some
        times and not at others, requiring ``maskVaries = True``.

    Returns
    -------
    climatology : object of same type as ``ds``
        A data set without the ``'Time'`` coordinate containing the mean
        of ds over all months in monthValues, weighted by the number of days
        in each month.
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    ds = add_years_months_days_in_month(ds, calendar)

    mask = xr.zeros_like(ds.month, bool)

    for month in monthValues:
        mask = numpy.logical_or(mask, ds.month == month)

    climatologyMonths = ds.where(mask, drop=True)

    climatology = _compute_masked_mean(climatologyMonths, maskVaries)

    return climatology




[docs]
def add_years_months_days_in_month(ds, calendar=None):
    """
    Add ``year``, ``month`` and ``daysInMonth`` as data arrays in ``ds``.
    The number of days in each month of ``ds`` is computed either using the
    ``startTime`` and ``endTime`` if available or assuming ``noleap``
    calendar and ignoring leap years.  ``year`` and ``month`` are computed
    accounting correctly for the the calendar.

    Parameters
    ----------
    ds : ``xarray.Dataset`` or ``xarray.DataArray`` object
        A data set with a ``Time`` coordinate expressed as days since
        0001-01-01

    calendar : {'gregorian', 'noleap'}, optional
        The name of one of the calendars supported by MPAS cores, used to
        determine ``year`` and ``month`` from ``Time`` coordinate

    Returns
    -------
    ds : object of same type as ``ds``
        The data set with ``year``, ``month`` and ``daysInMonth`` data arrays
        added (if not already present)
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    if ('year' in ds.coords and 'month' in ds.coords and
            'daysInMonth' in ds.coords):
        return ds

    ds = ds.copy()

    if 'year' not in ds.coords or 'month' not in ds.coords:
        if calendar is None:
            raise ValueError('calendar must be provided if month and year '
                             'coordinate is not in ds.')
        datetimes = days_to_datetime(ds.Time, calendar=calendar)

    if 'year' not in ds.coords:
        ds.coords['year'] = ('Time', [date.year for date in datetimes])

    if 'month' not in ds.coords:
        ds.coords['month'] = ('Time', [date.month for date in datetimes])

    if 'daysInMonth' not in ds.coords:
        if 'startTime' in ds.coords and 'endTime' in ds.coords:
            ds.coords['daysInMonth'] = ds.endTime - ds.startTime
        else:
            if calendar == 'gregorian':
                print('Warning: The MPAS run used the Gregorian calendar '
                      'but does not appear to have\n'
                      'supplied start and end times.  Climatologies '
                      'will be computed with\n'
                      'month durations ignoring leap years.')

            daysInMonth = numpy.array(
                [constants.daysInMonth[int(month) - 1] for
                 month in ds.month.values], float)
            ds.coords['daysInMonth'] = ('Time', daysInMonth)

    return ds



def remap_and_write_climatology(config, climatologyDataSet,
                                climatologyFileName, remappedFileName,
                                remapper, logger=None):
    """
    Given a field in a climatology data set, use the ``remapper`` to remap
    horizontal dimensions of all fields, write the results to an output file,
    and return the remapped data set.

    Note that ``climatologyFileName`` and ``remappedFileName`` will be
    overwritten if they exist, so if this behavior is not desired, the calling
    code should skip this call if the files exist and simply load the contents
    of ``remappedFileName``.

    Parameters
    ----------
    config : tranche.Tranche
        Contains configuration options

    climatologyDataSet : ``xarray.DataSet`` or ``xarray.DataArray`` object
        A data set containing a climatology

    fieldName : str
        A field within the climatology to be remapped

    climatologyFileName : str
        The name of the output file to which the data set should be written
        before remapping (if using ncremap).

    remappedFileName : str
        The name of the output file to which the remapped data set should
        be written.

    remapper : ``pyremap.Remapper`` object
        A remapper that can be used to remap files or data sets to a
        comparison grid.

    logger : ``logging.Logger``, optional
        A logger to which ncclimo output should be redirected

    Returns
    -------
    remappedClimatology : ``xarray.DataSet`` or ``xarray.DataArray`` object
        A data set containing the remapped climatology
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    useNcremap = config.getboolean('climatology', 'useNcremap')

    if remapper.map_filename is None:
        # no remapping is needed
        remappedClimatology = climatologyDataSet
    else:
        renormalizationThreshold = config.getfloat(
            'climatology', 'renormalizationThreshold')
        parallel_exec = config.get(
            'execute', 'ncremapParallelExec')
        if parallel_exec == 'None':
            parallel_exec = None

        if useNcremap:
            if not os.path.exists(climatologyFileName):
                write_netcdf(climatologyDataSet, climatologyFileName)
            remapper.ncremap(
                in_filename=climatologyFileName,
                out_filename=remappedFileName,
                overwrite=True,
                renormalize=renormalizationThreshold,
                logger=logger,
                parallel_exec=parallel_exec)
            remappedClimatology = xr.open_dataset(remappedFileName)
        else:

            remappedClimatology = remapper.remap_numpy(
                climatologyDataSet, renormalizationThreshold)
            write_netcdf_with_fill(remappedClimatology, remappedFileName)
    return remappedClimatology



[docs]
def get_unmasked_mpas_climatology_directory(config, op='avg'):
    """
    Get the directory for an unmasked MPAS climatology produced by ncclimo,
    making the directory if it doesn't already exist

    Parameters
    ----------
    config : tranche.Tranche
        configuration options

    op : {'avg', 'min', 'max'}
         operator for monthly stats
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    climatologyOpDirectory = get_climatology_op_directory(config, op)

    mpasMeshName = config.get('input', 'mpasMeshName')

    directory = '{}/unmasked_{}'.format(climatologyOpDirectory,
                                        mpasMeshName)

    make_directories(directory)
    return directory




[docs]
def get_unmasked_mpas_climatology_file_name(config, season, componentName,
                                            op='avg'):
    """
    Get the file name for an unmasked MPAS climatology produced by ncclimo

    Parameters
    ----------
    config : tranche.Tranche
        configuration options

    season : str
        One of the seasons in ``constants.monthDictionary``

    componentName : {'ocean', 'seaIce'}
        The MPAS component for which the climatology is being computed

    op : {'avg', 'min', 'max'}
         operator for monthly stats
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    startYear = config.getint('climatology', 'startYear')
    endYear = config.getint('climatology', 'endYear')

    if componentName == 'ocean':
        ncclimoModel = 'mpaso'
    elif componentName == 'seaIce':
        ncclimoModel = 'mpascice'
    else:
        raise ValueError('component {} is not supported by ncclimo.\n'
                         'Check with Charlie Zender and Xylar Asay-Davis\n'
                         'about getting it added'.format(componentName))

    directory = get_unmasked_mpas_climatology_directory(config, op)

    make_directories(directory)
    monthValues = sorted(constants.monthDictionary[season])
    startMonth = monthValues[0]
    endMonth = monthValues[-1]

    suffix = '{:04d}{:02d}_{:04d}{:02d}_climo'.format(
        startYear, startMonth, endYear, endMonth)

    if season in constants.abrevMonthNames:
        season = '{:02d}'.format(monthValues[0])
    fileName = '{}/{}_{}_{}.nc'.format(directory, ncclimoModel,
                                       season, suffix)
    return fileName




[docs]
def get_masked_mpas_climatology_file_name(config, season, componentName,
                                          climatologyName, op='avg'):
    """
    Get the file name for a masked MPAS climatology

    Parameters
    ----------
    config : tranche.Tranche
        Configuration options

    season : str
        One of the seasons in ``constants.monthDictionary``

    componentName : {'ocean', 'seaIce'}
        The MPAS component for which the climatology is being computed

    climatologyName : str
        The name of the climatology (typically the name of a field to mask
        and later remap)

    op : {'avg', 'min', 'max'}
         operator for monthly stats
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    startYear = config.getint('climatology', 'startYear')
    endYear = config.getint('climatology', 'endYear')
    mpasMeshName = config.get('input', 'mpasMeshName')

    if componentName == 'ocean':
        ncclimoModel = 'mpaso'
    elif componentName == 'seaIce':
        ncclimoModel = 'mpascice'
    else:
        raise ValueError('component {} is not supported by ncclimo.\n'
                         'Check with Charlie Zender and Xylar Asay-Davis\n'
                         'about getting it added'.format(componentName))

    climatologyOpDirectory = get_climatology_op_directory(config, op)

    stageDirectory = '{}/masked'.format(climatologyOpDirectory)

    directory = '{}/{}_{}'.format(
        stageDirectory, climatologyName,
        mpasMeshName)

    make_directories(directory)

    monthValues = sorted(constants.monthDictionary[season])
    startMonth = monthValues[0]
    endMonth = monthValues[-1]

    suffix = '{:04d}{:02d}_{:04d}{:02d}_climo'.format(
        startYear, startMonth, endYear, endMonth)

    if season in constants.abrevMonthNames:
        season = '{:02d}'.format(monthValues[0])
    fileName = '{}/{}_{}_{}.nc'.format(
        directory, ncclimoModel, season, suffix)

    return fileName




[docs]
def get_remapped_mpas_climatology_file_name(config, season, componentName,
                                            climatologyName,
                                            comparisonGridName,
                                            op='avg'):
    """
    Get the file name for a masked MPAS climatology

    Parameters
    ----------
    config : tranche.Tranche
        Configuration options

    season : str
        One of the seasons in ``constants.monthDictionary``

    componentName : {'ocean', 'seaIce'}
        The MPAS component for which the climatology is being computed

    climatologyName : str
        The name of the climatology (typically the name of a field to mask
        and later remap)

    comparisonGridName : str
        The name of the comparison grid to use for remapping.  If it is one
        of the known comparison grid names, the full grid name is looked up via
        :py:func:`mpas_analysis.shared.climatology.get_comparison_descriptor()`

    op : {'avg', 'min', 'max'}
         operator for monthly stats
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    startYear = config.getint('climatology', 'startYear')
    endYear = config.getint('climatology', 'endYear')
    mpasMeshName = config.get('input', 'mpasMeshName')

    if componentName == 'ocean':
        ncclimoModel = 'mpaso'
    elif componentName == 'seaIce':
        ncclimoModel = 'mpascice'
    else:
        raise ValueError('component {} is not supported by ncclimo.\n'
                         'Check with Charlie Zender and Xylar Asay-Davis\n'
                         'about getting it added'.format(componentName))

    climatologyOpDirectory = get_climatology_op_directory(config, op)

    if comparisonGridName in known_comparison_grids:
        comparisonDescriptor = get_comparison_descriptor(config,
                                                         comparisonGridName)
        comparisonFullMeshName = comparisonDescriptor.mesh_name
    else:
        comparisonFullMeshName = comparisonGridName.replace(' ', '_')

    stageDirectory = '{}/remapped'.format(climatologyOpDirectory)

    directory = '{}/{}_{}_to_{}'.format(stageDirectory, climatologyName,
                                        mpasMeshName, comparisonFullMeshName)

    make_directories(directory)

    monthValues = sorted(constants.monthDictionary[season])
    startMonth = monthValues[0]
    endMonth = monthValues[-1]

    suffix = '{:04d}{:02d}_{:04d}{:02d}_climo'.format(
        startYear, startMonth, endYear, endMonth)

    if season in constants.abrevMonthNames:
        season = '{:02d}'.format(monthValues[0])
    fileName = '{}/{}_{}_{}.nc'.format(
        directory, ncclimoModel, season, suffix)

    return fileName



def get_climatology_op_directory(config, op='avg'):
    """
    Get the output directory for MPAS climatologies from output with the given
    monthly operator: avg, min or max
    """
    climatologyBaseDirectory = build_config_full_path(
        config, 'output', 'mpasClimatologySubdirectory')

    return '{}/{}'.format(climatologyBaseDirectory, op)


def _compute_masked_mean(ds, maskVaries):
    """
    Compute the time average of data set, masked out where the variables in ds
    are NaN and, if ``maskVaries == True``, weighting by the number of days
    used to compute each monthly mean time in ds.
    """

    # Authors
    # -------
    # Xylar Asay-Davis

    def ds_to_weights(ds):
        # make an identical data set to ds but replacing all data arrays with
        # nonnull applied to that data array
        weights = ds.copy(deep=True)
        if isinstance(ds, xr.core.dataarray.DataArray):
            weights = ds.notnull()
        elif isinstance(ds, xr.core.dataset.Dataset):
            for var in ds.data_vars:
                weights[var] = ds[var].notnull()
        else:
            raise TypeError('ds must be an instance of either xarray.Dataset '
                            'or xarray.DataArray.')

        return weights

    if maskVaries:
        dsWeightedSum = (ds * ds.daysInMonth).sum(dim='Time', keep_attrs=True)

        weights = ds_to_weights(ds)

        weightSum = (weights * ds.daysInMonth).sum(dim='Time')

        timeMean = dsWeightedSum / weightSum.where(weightSum > 0.)
    else:
        days = ds.daysInMonth.sum(dim='Time')

        dsWeightedSum = (ds * ds.daysInMonth).sum(dim='Time', keep_attrs=True)

        timeMean = dsWeightedSum / days.where(days > 0.)

    return timeMean


def _matches_comparison(obsDescriptor, comparisonDescriptor):
    """
    Determine if the two meshes are the same
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    if isinstance(obsDescriptor, ProjectionGridDescriptor) and \
            isinstance(comparisonDescriptor, ProjectionGridDescriptor):
        # pretty hard to determine if projections are the same, so we'll rely
        # on the grid names
        match = obsDescriptor.mesh_name == comparisonDescriptor.mesh_name and \
                len(obsDescriptor.x) == len(comparisonDescriptor.x) and \
                len(obsDescriptor.y) == len(comparisonDescriptor.y) and \
                numpy.all(numpy.isclose(obsDescriptor.x,
                                        comparisonDescriptor.x)) and \
                numpy.all(numpy.isclose(obsDescriptor.y,
                                        comparisonDescriptor.y))
    elif isinstance(obsDescriptor, LatLonGridDescriptor) and \
            isinstance(comparisonDescriptor, LatLonGridDescriptor):
        match = ((('degree' in obsDescriptor.units and
                   'degree' in comparisonDescriptor.units) or
                  ('radian' in obsDescriptor.units and
                   'radian' in comparisonDescriptor.units)) and
                 len(obsDescriptor.lat) == len(comparisonDescriptor.lat) and
                 len(obsDescriptor.lon) == len(comparisonDescriptor.lon) and
                 numpy.all(numpy.isclose(obsDescriptor.lat,
                                         comparisonDescriptor.lat)) and
                 numpy.all(numpy.isclose(obsDescriptor.lon,
                                         comparisonDescriptor.lon)))
    else:
        match = False

    return match


def _setup_climatology_caching(ds, startYearClimo, endYearClimo,
                               yearsPerCacheFile, cachePrefix,
                               monthValues):
    """
    Determine which cache files already exist, which are incomplete and which
    years are present in each cache file (whether existing or to be created).
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    cacheInfo = []

    cacheIndices = -1 * numpy.ones(ds.sizes['Time'], int)
    monthsInDs = ds.month.values
    yearsInDs = ds.year.values

    # figure out which files to load and which years go in each file
    for firstYear in range(startYearClimo, endYearClimo + 1,
                           yearsPerCacheFile):
        years = range(firstYear, firstYear + yearsPerCacheFile)

        yearString, fileSuffix = _get_year_string(years[0], years[-1])
        outputFileClimo = '{}_{}.nc'.format(cachePrefix, fileSuffix)

        done = False
        if os.path.exists(outputFileClimo):
            # already cached
            dsCached = None
            try:
                dsCached = xr.open_dataset(outputFileClimo)
            except IOError:
                # assuming the cache file is corrupt, so deleting it.
                print('Warning: Deleting cache file {}, which appears to '
                      'have been corrupted.'.format(outputFileClimo))

                os.remove(outputFileClimo)

            monthsIfDone = len(monthValues) * len(years)
            if ((dsCached is not None) and
                    (dsCached.attrs['totalMonths'] == monthsIfDone)):
                # also complete, so we can move on
                done = True
            if dsCached is not None:
                dsCached.close()

        cacheIndex = len(cacheInfo)
        for year in years:
            for month in monthValues:
                mask = numpy.logical_and(yearsInDs == year,
                                         monthsInDs == month)
                cacheIndices[mask] = cacheIndex

        if numpy.count_nonzero(cacheIndices == cacheIndex) == 0:
            continue

        cacheInfo.append((outputFileClimo, done, yearString))

    ds = ds.copy()
    ds.coords['cacheIndices'] = ('Time', cacheIndices)

    return cacheInfo, cacheIndices


def _cache_individual_climatologies(ds, cacheInfo, printProgress,
                                    yearsPerCacheFile, monthValues,
                                    calendar):
    """
    Cache individual climatologies for later aggregation.
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    for cacheIndex, info in enumerate(cacheInfo):
        outputFileClimo, done, yearString = info
        if done:
            continue
        dsYear = ds.where(ds.cacheIndices == cacheIndex, drop=True)

        if printProgress:
            print('     {}'.format(yearString))

        totalDays = dsYear.daysInMonth.sum(dim='Time').values

        monthCount = dsYear.sizes['Time']

        climatology = compute_climatology(dsYear, monthValues, calendar,
                                          maskVaries=False)

        climatology.attrs['totalDays'] = totalDays
        climatology.attrs['totalMonths'] = monthCount
        climatology.attrs['fingerprintClimo'] = fingerprint_generator()

        write_netcdf_with_fill(climatology, outputFileClimo)
        climatology.close()


def _cache_aggregated_climatology(startYearClimo, endYearClimo, cachePrefix,
                                  printProgress, monthValues,
                                  cacheInfo):
    """
    Cache aggregated climatology from individual climatologies.
    """
    # Authors
    # -------
    # Xylar Asay-Davis

    yearString, fileSuffix = _get_year_string(startYearClimo, endYearClimo)
    outputFileClimo = '{}_{}.nc'.format(cachePrefix, fileSuffix)

    done = False
    if len(cacheInfo) == 0:
        climatology = None
        done = True

    if os.path.exists(outputFileClimo):
        # already cached
        climatology = None
        try:
            climatology = xr.open_dataset(outputFileClimo)

        except IOError:
            # assuming the cache file is corrupt, so deleting it.
            print('Warning: Deleting cache file {}, which appears to have '
                  'been corrupted.'.format(outputFileClimo))
            os.remove(outputFileClimo)

        if len(cacheInfo) == 1 and outputFileClimo == cacheInfo[0][0]:
            # theres only one cache file and it already has the same name
            # as the aggregated file so no need to aggregate
            done = True

        elif climatology is not None:
            monthsIfDone = ((endYearClimo - startYearClimo + 1)
                            * len(monthValues))
            if climatology.attrs['totalMonths'] == monthsIfDone:
                # also complete, so we can move on
                done = True
            else:
                climatology.close()

    if not done:
        if printProgress:
            print('   Computing aggregated climatology '
                  '{}...'.format(yearString))

        first = True
        for cacheIndex, info in enumerate(cacheInfo):
            inFileClimo = info[0]
            ds = xr.open_dataset(inFileClimo)
            days = ds.attrs['totalDays']
            months = ds.attrs['totalMonths']
            if first:
                totalDays = days
                totalMonths = months
                climatology = ds * days
                first = False
            else:
                totalDays += days
                totalMonths += months
                climatology = climatology + ds * days

            ds.close()
        climatology = climatology / totalDays

        climatology.attrs['totalDays'] = totalDays
        climatology.attrs['totalMonths'] = totalMonths
        climatology.attrs['fingerprintClimo'] = fingerprint_generator()

        write_netcdf_with_fill(climatology, outputFileClimo)

    return climatology


def _get_year_string(startYear, endYear):
    if startYear == endYear:
        yearString = '{:04d}'.format(startYear)
        fileSuffix = 'year{}'.format(yearString)
    else:
        yearString = '{:04d}-{:04d}'.format(startYear, endYear)
        fileSuffix = 'years{}'.format(yearString)

    return yearString, fileSuffix