#!/usr/bin/env python
"""Support for ISTP-compliant CDFs
The `ISTP metadata standard <https://spdf.gsfc.nasa.gov/sp_use_of_cdf.html>`_
specifies the interpretation of the attributes in a CDF to describe
relationships between the variables and their physical interpretation.
This module supports that subset of CDFs.
Authors: Jon Niehof
Additional Contributors: Lorna Ellis, Asher Merrill
Institution: University of New Hampshire
Contact: Jonathan.Niehof@unh.edu
"""
import collections
import datetime
import functools
import inspect
import itertools
import math
import os.path
import re
import numpy
import spacepy.datamodel
import spacepy.pycdf
import spacepy.pycdf.const
class VariableChecks(object):
"""ISTP compliance checks for a single variable.
Checks a variable's compliance with ISTP standards. This mostly
performs checks that are not currently performed by the `ISTP
skeleton editor <https://spdf.gsfc.nasa.gov/skteditor/>`_. All
tests return a list, one error string for every noncompliance
found (empty list if compliant). `all` will perform all
tests and concatenate all errors.
"""
#When adding new tests, add to list above
#Validation failures should be formatted as a sentence (initial cap,
#closing period) and NOT include the variable name.
@classmethod
def all(cls, v, catch=False):
"""Perform all variable tests
Parameters
----------
v : `~.pycdf.Var`
Variable to check
catch : bool
Catch exceptions in tests (default False). If True, any
exceptions in subtests will result in an addition to the
validation failures of the form "Test x did not complete."
Calling the individual test will reveal the full traceback.
Returns
-------
list of str
Description of each validation failure.
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> spacepy.pycdf.istp.VariableChecks.all(v)
['No FIELDNAM attribute.']
"""
callme = [func for name, func in inspect.getmembers(cls)
if not name.startswith('_') and not name.endswith('_')
and callable(func) and name != 'all']
errors = []
for f in callme:
try:
errors.extend(f(v))
except:
if catch:
errors.append('Test {} did not complete.'.format(
f.__name__))
else:
raise
return errors
@classmethod
def depends(cls, v):
"""Checks that DELTA, DEPEND, and LABL_PTR variables exist
Check that variables specified in the variable attributes for
`DELTA
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#DELTA>`_,
`DEPEND
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#DEPEND_0>`_,
and `LABL_PTR
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#LABL_PTR_1>`_
exist in the CDF.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
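Examples
--------
A minimal sketch; ``Var`` and the missing ``Epoch`` are illustrative names:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['DEPEND_0'] = 'Epoch'
>>> spacepy.pycdf.istp.VariableChecks.depends(v)
['DEPEND_0 variable Epoch missing.']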
"""
return ['{} variable {} missing.'.format(a, v.attrs[a])
for a in v.attrs
if (a.startswith(('DEPEND_', 'LABL_PTR_',))
or a in ('DELTA_PLUS_VAR', 'DELTA_MINUS_VAR'))
and not v.attrs[a] in v.cdf_file]
@classmethod
def deltas(cls, v):
"""Check DELTA variables
Check that variables specified in the variable attributes for
`DELTA
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#DELTA>`_
match the type, size, and units of this variable.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
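Examples
--------
A minimal sketch; the DELTA variable's type deliberately differs:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1., 2., 3.], type=spacepy.pycdf.const.CDF_DOUBLE)
>>> d = f.new('Var_DELTA', data=[.1, .1, .1], type=spacepy.pycdf.const.CDF_FLOAT)
>>> v.attrs['DELTA_PLUS_VAR'] = 'Var_DELTA'
>>> spacepy.pycdf.istp.VariableChecks.deltas(v)
['DELTA_PLUS_VAR type CDF_FLOAT does not match variable type CDF_DOUBLE.']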
"""
errs = []
if v.rv():
shape = v.shape[1:]
n_recs = len(v)
else:
shape = v.shape
n_recs = None
for delta in ('DELTA_PLUS_VAR', 'DELTA_MINUS_VAR'):
if not delta in v.attrs:
continue
deltavar = v.cdf_file[v.attrs[delta]]
if deltavar.type() != v.type():
errs.append(
'{} type {} does not match variable type {}.'.format(
delta, spacepy.pycdf.lib.cdftypenames[deltavar.type()],
spacepy.pycdf.lib.cdftypenames[v.type()]))
if deltavar.attrs.get('UNITS', None) != v.attrs.get('UNITS', None):
errs.append('{} units do not match variable units.'.format(
delta))
if deltavar.rv():
dshape = deltavar.shape[1:]
d_n_recs = len(deltavar)
else:
dshape = deltavar.shape
d_n_recs = None
if dshape != shape:
errs.append(
'{} shape {} does not match variable shape {}.'.format(
delta, dshape, shape))
if d_n_recs is not None and n_recs is not None \
and d_n_recs != n_recs:
errs.append((
'{} record count {} does not match variable record'
' count {}.').format(
delta, d_n_recs, n_recs))
return errs
@classmethod
def depsize(cls, v):
"""Checks that DEPEND has same shape as that dim
Compares the size of variables specified in the variable
attributes for `DEPEND
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#DEPEND_0>`_
and compares to the size of the corresponding dimension in
this variable.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
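Examples
--------
A minimal sketch; the dependency is deliberately the wrong size:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[[1, 2, 3], [4, 5, 6]])
>>> d = f.new('Dep', data=[10, 20], recVary=False)
>>> v.attrs['DEPEND_1'] = 'Dep'
>>> spacepy.pycdf.istp.VariableChecks.depsize(v)
['Dim 1 sized 3 but DEPEND_1 Dep sized 2.']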
"""
rv = int(v.rv()) #RV is a leading dimension
errs = []
# Check that the variable doesn't have an invalid DEPEND_1
if v.shape == (0,):
if 'DEPEND_1' in v.attrs or 'DEPEND_2' in v.attrs:
errs.append('Do not expect DEPEND_1 or DEPEND_2 in 1 dimensional variable.')
for i in range(rv, len(v.shape)): #This is index on shape (of var)
depidx = i + 1 - rv #This is x in DEPEND_x
target = v.shape[i]
if not 'DEPEND_{}'.format(depidx) in v.attrs:
continue
d = v.attrs['DEPEND_{}'.format(depidx)]
if d in v.cdf_file:
dv = v.cdf_file[d]
else:
continue #this is a different error
if dv.rv() != ('DEPEND_0' in dv.attrs):
errs.append('DEPEND_{} {} is RV but has no DEPEND_0.'
.format(depidx, d))
continue
#We hope the only weirdness is whether the dependency
#is constant, or dependent on record. If it's dependent
#on another dependency, this gets really weird really fast
# If the dependency is dependent, remove the lower level
# dependency size from consideration
# eg. if counts [80,48], depends on energy [80,48],
# depends on look [80], remove 80 from the view of energy
# so that we accurately check 48==48.
# NB: This assumes max of two layers of dependency
if 'DEPEND_2' in dv.attrs:
errs.append('Do not expect three layers of dependency.')
continue
elif 'DEPEND_1' in dv.attrs:
dd = dv.attrs['DEPEND_1']
if dd in v.cdf_file:
ddv = v.cdf_file[dd]
else:
continue #this is a different error
actual = list(dv.shape)
for ii in actual:
if ii in ddv.shape:
actual.remove(ii)
if 'DEPEND_0' in dv.attrs:
# record varying
dd = dv.attrs['DEPEND_0']
if dd[:5] != 'Epoch':
errs.append('Expect DEPEND_0 to be Epoch.')
continue
if dd in v.cdf_file:
ddv = v.cdf_file[dd]
else:
continue #this is a different error
for ii in actual:
if ii in ddv.shape:
actual.remove(ii)
if len(actual) != 1:
errs.append('More complicated double dependency than taken into account.')
continue
else:
actual = actual[0]
else:
actual = dv.shape[int(dv.rv())]
if target != actual:
errs.append('Dim {} sized {} but DEPEND_{} {} sized {}.'.format(
i, target, depidx, d, actual))
return errs
@classmethod
def empty_entry(cls, v):
"""Check for attributes with empty string
Checks attributes for this variable for any entries consisting
of an empty string. These should be replaced with a single space.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
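Examples
--------
A minimal sketch, assuming an empty entry can be written:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['CATDESC'] = ''
>>> spacepy.pycdf.istp.VariableChecks.empty_entry(v)
['Empty CHAR entry for attribute CATDESC.']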
"""
errs = []
for a in v.attrs:
if v.attrs.type(a) in (spacepy.pycdf.const.CDF_CHAR.value,
spacepy.pycdf.const.CDF_UCHAR.value) \
and v.attrs[a] == '':
errs.append('Empty CHAR entry for attribute {}.'.format(a))
return errs
@classmethod
def fillval(cls, v):
"""Check for FILLVAL presence, type, value
Checks variable for existence of `FILLVAL
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#FILLVAL>`_
attribute and makes sure it is the same type as variable and matches
ISTP value.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
See Also
--------
spacepy.pycdf.istp.fillval : Automatic setting of this value.
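Examples
--------
A minimal sketch, mirroring the `all` example above:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> spacepy.pycdf.istp.VariableChecks.fillval(v)
['No FILLVAL attribute.']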
"""
errs = []
if not 'FILLVAL' in v.attrs:
return ['No FILLVAL attribute.']
if v.attrs.type('FILLVAL') != v.type():
errs.append(
'FILLVAL type {} does not match variable type {}.'.format(
spacepy.pycdf.lib.cdftypenames[v.attrs.type('FILLVAL')],
spacepy.pycdf.lib.cdftypenames[v.type()]))
expected = fillval(v, ret=True)
timetype = v.type() in spacepy.pycdf.lib.timetypes
actual = (v.cdf_file.raw_var(v.name()) if timetype else v)\
.attrs['FILLVAL']
match = numpy.isclose(
actual, expected, atol=0, rtol=1e-7)\
if numpy.issubdtype(v.dtype, numpy.floating)\
else numpy.all(actual == expected)
if not match:
if timetype:
if v.type() == spacepy.pycdf.const.CDF_EPOCH16.value:
converted_expected = spacepy.pycdf.lib.v_epoch16_to_datetime(
numpy.asanyarray(expected))
else:
converted_expected = {
spacepy.pycdf.const.CDF_EPOCH.value:
spacepy.pycdf.lib.v_epoch_to_datetime,
spacepy.pycdf.const.CDF_TIME_TT2000.value:
spacepy.pycdf.lib.v_tt2000_to_datetime
}[v.type()](expected)
errs.append(
'FILLVAL {} ({}), should be {} ({}) for variable type {}.'
.format(
actual,
v.attrs['FILLVAL'],
expected,
converted_expected,
spacepy.pycdf.lib.cdftypenames[v.type()]))
else:
errs.append(
'FILLVAL {}, should be {} for variable type {}.'.format(
actual, expected,
spacepy.pycdf.lib.cdftypenames[v.type()]))
return errs
@classmethod
def recordcount(cls, v):
"""Check that the DEPEND_0 has same record count as variable
Checks the record count of the variable specified in the
variable attribute for `DEPEND_0
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#DEPEND_0>`_
and compares to the record count for this variable.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
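Examples
--------
A minimal sketch; record counts deliberately disagree:
>>> import datetime
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> f['Epoch'] = [datetime.datetime(2010, 1, 1, i) for i in range(5)]
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['DEPEND_0'] = 'Epoch'
>>> spacepy.pycdf.istp.VariableChecks.recordcount(v)
['3 records; DEPEND_0 Epoch has 5.']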
"""
if not v.rv() or not 'DEPEND_0' in v.attrs:
return []
dep0 = v.attrs['DEPEND_0']
if not dep0 in v.cdf_file: #This is a DIFFERENT error
return []
if len(v) != len(v.cdf_file[dep0]):
return ['{} records; DEPEND_0 {} has {}.'.format(
len(v), dep0, len(v.cdf_file[dep0]))]
return []
@classmethod
def _validhelper(cls, v, rng=True):
"""Helper function for checking SCALEMIN/MAX, VALIDMIN/MAX
Parameters
----------
v : `~.pycdf.Var`
Variable to check
rng : bool
Do range check (True, default) or scale check (False)
Returns
-------
list of str
Description of each validation failure.
"""
validscale = 'VALID' if rng else 'SCALE'
whichmin, whichmax = ('VALIDMIN', 'VALIDMAX') if rng \
else ('SCALEMIN', 'SCALEMAX')
errs = []
vshape = v.shape
minval, maxval = spacepy.pycdf.lib.get_minmax(v.type())
if rng:
data = v[...]
is_fill = False
if 'FILLVAL' in v.attrs:
filldtype = spacepy.pycdf.lib.numpytypedict.get(
v.attrs.type('FILLVAL'), object)
if numpy.issubdtype(v.dtype, numpy.floating) \
and numpy.issubdtype(filldtype, numpy.floating):
is_fill = numpy.isclose(data, v.attrs['FILLVAL'])
elif numpy.can_cast(numpy.asanyarray(v.attrs['FILLVAL']),
v.dtype):
is_fill = data == v.attrs['FILLVAL']
for which in (whichmin, whichmax):
if not which in v.attrs:
continue
atype = v.attrs.type(which)
vtype = v.type()
if atype != vtype:
errs.append(
'{} type {} does not match variable type {}.'.format(
which,
spacepy.pycdf.lib.cdftypenames[atype],
spacepy.pycdf.lib.cdftypenames[vtype]))
attrval = v.attrs[which]
multidim = bool(numpy.shape(attrval)) #multi-dimensional
if multidim: #Compare shapes, require only 1D var
#Match attribute dim to first non-record var dim
firstdim = int(v.rv())
if vshape[firstdim] != numpy.shape(attrval)[0]:
errs.append(('{} element count {} does not match first data'
' dimension size {}.').format(
which, numpy.shape(attrval)[0],
v.shape[firstdim]))
continue
if len(vshape) != firstdim + 1: #only one non-record dim
errs.append('Multi-element {} only valid with 1D variable.'
.format(which))
continue
if firstdim: #Add pseudo-record dim
attrval = numpy.reshape(attrval, (1, -1))
# min, max, variable data all same dtype
if not numpy.can_cast(numpy.asanyarray(attrval),
numpy.asanyarray(minval).dtype) or \
(atype in spacepy.pycdf.lib.timetypes) != (vtype in spacepy.pycdf.lib.timetypes):
errs.append(
'{} type {} not comparable to variable type {}.'.format(
which,
spacepy.pycdf.lib.cdftypenames[atype],
spacepy.pycdf.lib.cdftypenames[vtype]
))
continue # Cannot do comparisons
if numpy.any((minval > attrval)) or numpy.any((maxval < attrval)):
errs.append('{} ({}) outside valid data range ({},{}).'.format(
which, attrval[0, :] if multidim else attrval,
minval, maxval))
if not rng or not len(v): #nothing to compare
continue
#Always put numpy array on the left so numpy knows to do element compare
idx = (data < attrval) if which == whichmin \
else (data > attrval)
idx = numpy.logical_and(idx, numpy.logical_not(is_fill))
if idx.any():
direction = 'under' if which == whichmin else 'over'
if len(vshape) == 0: #Scalar
errs.append('Value {} {} {} {}.'.format(
data, direction, which,
attrval[0, :] if multidim else attrval))
continue
badidx = numpy.nonzero(idx)
badvals = data[badidx]
if len(badidx) > 1: #Multi-dimensional data
badidx = numpy.transpose(badidx) #Group by value not axis
else:
badidx = badidx[0] #Just recover the index value
if len(badvals) < 10:
badvalstr = ', '.join(str(d) for d in badvals)
badidxstr = ', '.join(str(d) for d in badidx)
errs.append('Value {} at index {} {} {} {}.'.format(
badvalstr, badidxstr,
direction, which,
attrval[0, :] if multidim else attrval))
else:
errs.append('{} values {} {} {}.'.format(
len(badvals), direction, which,
attrval[0, :] if multidim else attrval))
if (whichmin in v.attrs) and (whichmax in v.attrs):
if numpy.any(v.attrs[whichmin] > v.attrs[whichmax]):
errs.append('{} > {}.'.format(whichmin, whichmax))
return errs
@classmethod
def validrange(cls, v):
"""Check that all values are within VALIDMIN/VALIDMAX, or FILLVAL
Compare all values of this variable to `VALIDMIN
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#VALIDMIN>`_
and ``VALIDMAX``; fails validation if any values are below
VALIDMIN or above ``VALIDMAX`` unless equal to `FILLVAL
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#FILLVAL>`_.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
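Examples
--------
A minimal sketch; one value lies below VALIDMIN:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['VALIDMIN'] = 2
>>> v.attrs['VALIDMAX'] = 10
>>> spacepy.pycdf.istp.VariableChecks.validrange(v)
['Value 1 at index 0 under VALIDMIN 2.']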
"""
return cls._validhelper(v)
@classmethod
def validscale(cls, v):
"""Check SCALEMIN<=SCALEMAX, and both in range for CDF datatype.
Compares `SCALEMIN
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#SCALEMIN>`_
to ``SCALEMAX`` to make sure it isn't larger and both are
within range of the variable CDF datatype.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
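Examples
--------
A minimal sketch; SCALEMIN deliberately exceeds SCALEMAX:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['SCALEMIN'] = 5
>>> v.attrs['SCALEMAX'] = 2
>>> spacepy.pycdf.istp.VariableChecks.validscale(v)
['SCALEMIN > SCALEMAX.']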
"""
return cls._validhelper(v, False)
@classmethod
def validdisplaytype(cls, v):
"""Check that plottype matches dimensions.
Check `DISPLAYTYPE
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#DISPLAY_TYPE>`_
of this variable and makes sure it is reasonable for the
variable dimensions.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
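Examples
--------
A minimal sketch; a 1D variable marked as a spectrogram:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['DISPLAY_TYPE'] = 'spectrogram'
>>> spacepy.pycdf.istp.VariableChecks.validdisplaytype(v)
['1 dim variable with spectrogram display type.']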
"""
time_st = 'time_series'
spec_st = 'spectrogram'
errs = []
if 'DISPLAY_TYPE' in v.attrs:
if (len(v.shape) == 1) and (v.attrs['DISPLAY_TYPE'] != time_st):
errs.append('1 dim variable with {} display type.'.format(
v.attrs['DISPLAY_TYPE']))
elif (len(v.shape) > 1) and (v.attrs['DISPLAY_TYPE'] != spec_st):
errs.append('Multi dim variable with {} display type.'.format(
v.attrs['DISPLAY_TYPE']))
return errs
@classmethod
def fieldnam(cls, v):
"""Check that FIELDNAM attribute matches variable name.
Compare `FIELDNAM
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#FIELDNAM>`_
attribute to the variable name; fail validation if they don't
match.
Parameters
----------
v : `~.pycdf.Var`
Variable to check
Returns
-------
list of str
Description of each validation failure.
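Examples
--------
A minimal sketch; the attribute deliberately disagrees with the name:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> v.attrs['FIELDNAM'] = 'var'
>>> spacepy.pycdf.istp.VariableChecks.fieldnam(v)
['FIELDNAM attribute var does not match var name.']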
"""
errs = []
vname = v.name()
if 'FIELDNAM' not in v.attrs:
errs.append('No FIELDNAM attribute.')
elif v.attrs['FIELDNAM'] != vname:
errs.append('FIELDNAM attribute {} does not match var name.'
.format(v.attrs['FIELDNAM']))
return errs
class FileChecks(object):
"""ISTP compliance checks for a CDF file.
Checks a file's compliance with ISTP standards. This mostly
performs checks that are not currently performed by the `ISTP
skeleton editor <https://spdf.gsfc.nasa.gov/skteditor/>`_. All
tests return a list, one error string for every noncompliance
found (empty list if compliant). `all` will perform all
tests and concatenate all errors.
"""
#When adding new tests, add to list above.
#Validation failures should be formatted as a sentence (initial cap,
#closing period).
@classmethod
def all(cls, f, catch=False):
"""Perform all variable and file-level tests
In addition to calling every test in this class, will also call
`VariableChecks.all` for every variable in the file.
Parameters
----------
f : `~.pycdf.CDF`
Open CDF file to check
catch : bool
Catch exceptions in tests (default False). If True, any
exceptions in subtests will result in an addition to the
validation failures of the form "Test x did not complete."
Calling the individual test will reveal the full traceback.
Returns
-------
list of str
Description of each validation failure.
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> spacepy.pycdf.istp.FileChecks.all(f)
['No Logical_source in global attrs.',
'No Logical_file_id in global attrs.',
'Cannot parse date from filename foo.cdf.',
'Var: No FIELDNAM attribute.']
"""
#Update this list when adding new test functions
callme = [func for name, func in inspect.getmembers(cls)
if not name.startswith('_') and not name.endswith('_')
and callable(func) and name != 'all']
errors = []
for func in callme:
try:
errors.extend(func(f))
except:
if catch:
errors.append('Test {} did not complete.'.format(
func.__name__))
else:
raise
for v in f:
errors.extend(('{}: {}'.format(v, e)
for e in VariableChecks.all(f[v], catch=catch)))
return errors
@classmethod
def empty_entry(cls, f):
"""Check for attributes with empty string
Checks global attributes of this file for any entries consisting
of an empty string. These should be replaced with a single space.
Parameters
----------
f : `~.pycdf.CDF`
Open CDF file to check
Returns
-------
list of str
Description of each validation failure.
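Examples
--------
A minimal sketch, assuming an empty global entry can be written:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> f.attrs['Project'] = ''
>>> spacepy.pycdf.istp.FileChecks.empty_entry(f)
['Empty CHAR entry 0 for attribute Project.']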
"""
errs = []
for a in f.attrs:
attr = f.attrs[a]
for i in range(attr.max_idx() + 1):
if attr.has_entry(i) \
and attr.type(i) in (spacepy.pycdf.const.CDF_CHAR.value,
spacepy.pycdf.const.CDF_UCHAR.value) \
and attr[i] == '':
errs.append('Empty CHAR entry {} for attribute {}.'
.format(i, a))
return errs
@classmethod
def filename(cls, f):
"""Compare filename to global attributes
Check global attribute `Logical_file_id
<https://spdf.gsfc.nasa.gov/istp_guide/gattributes.html#Logical_file_id>`_
and `Logical_source
<https://spdf.gsfc.nasa.gov/istp_guide/gattributes.html#Logical_source>`_
for consistency with CDF filename.
Parameters
----------
f : `~.pycdf.CDF`
Open CDF file to check
Returns
-------
list of str
Description of each validation failure.
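Examples
--------
A minimal sketch; the attributes deliberately mismatch the filename:
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> f.attrs['Logical_source'] = 'bar'
>>> f.attrs['Logical_file_id'] = 'bar_20120101_v1.0.0'
>>> spacepy.pycdf.istp.FileChecks.filename(f)
["Logical_source bar doesn't match filename foo.cdf.",
 "Logical_file_id bar_20120101_v1.0.0 doesn't match filename foo.cdf."]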
"""
errs = []
for a in ('Logical_source', 'Logical_file_id'):
if not a in f.attrs or len(f.attrs[a]) == 0:
errs.append('No {} in global attrs.'.format(a))
if errs:
return errs
fname = os.path.basename(f.pathname)
fname = fname.decode('ascii')
if not fname.startswith(f.attrs['Logical_source'][0]):
errs.append("Logical_source {} doesn't match filename {}.".format(
f.attrs['Logical_source'][0], fname))
if fname[:-4] != f.attrs['Logical_file_id'][0]:
errs.append("Logical_file_id {} doesn't match filename {}.".format(
f.attrs['Logical_file_id'][0], fname))
return errs
@classmethod
def time_monoton(cls, f):
"""Checks that times are monotonic
Check that all `Epoch
<https://spdf.gsfc.nasa.gov/istp_guide/variables.html#support_data_eg1>`_
variables are monotonically increasing.
Parameters
----------
f : `~.pycdf.CDF`
Open CDF file to check
Returns
-------
list of str
Description of each validation failure.
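Examples
--------
A minimal sketch with one out-of-order time:
>>> import datetime
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> f['Epoch'] = [datetime.datetime(2010, 1, 1),
...               datetime.datetime(2010, 1, 3),
...               datetime.datetime(2010, 1, 2)]
>>> spacepy.pycdf.istp.FileChecks.time_monoton(f)
['Epoch: Nonmonotonic time at record 2.']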
"""
errs = []
for v in f:
if not f[v].type() in (spacepy.pycdf.const.CDF_EPOCH.value,
spacepy.pycdf.const.CDF_EPOCH16.value,
spacepy.pycdf.const.CDF_TIME_TT2000.value):
continue
data = f[v][...]
idx = numpy.where(numpy.diff(data) < datetime.timedelta(0))[0]
if not len(idx): #any(idx) would miss the case where the only bad index is 0
continue
errs.append('{}: Nonmonotonic time at record {}.'.format(
v, ', '.join((str(i) for i in (idx + 1)))))
return errs
@classmethod
def times(cls, f):
"""Compare filename to times
Check that all `Epoch
<https://spdf.gsfc.nasa.gov/istp_guide/variables.html#support_data_eg1>`_
variables only contain times matching filename.
Parameters
----------
f : `~.pycdf.CDF`
Open CDF file to check
Returns
-------
list of str
Description of each validation failure.
Notes
-----
This function assumes daily files and should be extended based on the
File_naming_convention global attribute (which itself is another good
check to have.)
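Examples
--------
A minimal sketch; the record's date deliberately mismatches the filename:
>>> import datetime
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('test_20120101_v1.0.0.cdf', create=True)
>>> f['Epoch'] = [datetime.datetime(2012, 1, 2)]
>>> spacepy.pycdf.istp.FileChecks.times(f)
["Epoch: date 20120102 doesn't match file test_20120101_v1.0.0.cdf."]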
"""
errs = []
fname = os.path.basename(f.pathname)
fname = fname.decode('ascii')
m = re.search(r'\d{8}', fname)
if not m:
return ['Cannot parse date from filename {}.'.format(fname)]
datestr = m.group(0)
for v in f:
if f[v].type() in (spacepy.pycdf.const.CDF_EPOCH.value,
spacepy.pycdf.const.CDF_EPOCH16.value,
spacepy.pycdf.const.CDF_TIME_TT2000.value):
datestrs = list(set((d.strftime('%Y%m%d') for d in f[v][...])))
if len(datestrs) == 0:
continue
elif len(datestrs) > 1:
errs.append('{}: multiple days {}.'.format(
v, ', '.join(sorted(datestrs))))
elif datestrs[0] != datestr:
errs.append('{}: date {} doesn\'t match file {}.'.format(
v, datestrs[0], fname))
return errs
def fillval(v, ret=False):
"""Set ISTP-compliant FILLVAL on a variable
Sets or returns a CDF variable's `FILLVAL
<https://spdf.gsfc.nasa.gov/istp_guide/vattributes.html#FILLVAL>`_
attribute to the value required by ISTP (based on variable type).
Parameters
----------
v : `~.pycdf.Var`
CDF variable to update
Other Parameters
----------------
ret : boolean
If True, return the value instead of setting it (Default False, set).
Returns
-------
various
If ``ret`` is True, returns the correct value for variable type (which
may be of various Python types). Otherwise sets the value and returns
``None``.
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3])
>>> spacepy.pycdf.istp.fillval(v)
>>> v.attrs['FILLVAL']
-128
"""
#Fill value, indexed by the CDF type (numeric)
fillvals = {}
#Integers
for i in (1, 2, 4, 8):
fillvals[getattr(spacepy.pycdf.const, 'CDF_INT{}'.format(i)).value] = \
- 2 ** (8*i - 1)
if i == 8:
continue
fillvals[getattr(spacepy.pycdf.const, 'CDF_UINT{}'.format(i)).value] = \
2 ** (8*i) - 1
fillvals[spacepy.pycdf.const.CDF_EPOCH16.value] = (-1e31, -1e31)
fillvals[spacepy.pycdf.const.CDF_REAL8.value] = -1e31
fillvals[spacepy.pycdf.const.CDF_REAL4.value] = -1e31
fillvals[spacepy.pycdf.const.CDF_CHAR.value] = ' '
fillvals[spacepy.pycdf.const.CDF_UCHAR.value] = ' '
#Equivalent pairs
for cdf_t, equiv in (
(spacepy.pycdf.const.CDF_TIME_TT2000, spacepy.pycdf.const.CDF_INT8),
(spacepy.pycdf.const.CDF_EPOCH, spacepy.pycdf.const.CDF_REAL8),
(spacepy.pycdf.const.CDF_BYTE, spacepy.pycdf.const.CDF_INT1),
(spacepy.pycdf.const.CDF_FLOAT, spacepy.pycdf.const.CDF_REAL4),
(spacepy.pycdf.const.CDF_DOUBLE, spacepy.pycdf.const.CDF_REAL8),
):
fillvals[cdf_t.value] = fillvals[equiv.value]
value = fillvals[v.type()]
if ret:
return value
if 'FILLVAL' in v.attrs:
del v.attrs['FILLVAL']
v.attrs.new('FILLVAL', data=value, type=v.type())
def nanfill(v):
"""Set fill values to NaN
Finds all values which are equal to ``FILLVAL``, greater than
``VALIDMAX``, or less than ``VALIDMIN``, and replaces them with ``NaN``
(not-a-number). This is an update-in-place operation; does not return
a copy.
Assumes a single value for ``VALIDMIN``, ``VALIDMAX``, ``FILLVAL``
(although if the attribute is not present, will simply assume no
restriction.)
Only applicable to floating-point types. Best applied to a
`~.pycdf.VarCopy` or `~.datamodel.dmarray`
rather than `~.pycdf.Var`. Updating a variable in a CDF
requires one write per changed value, and also will result in a CDF
that is no longer ISTP compliant.
Because of floating-point comparison, the matching to ``FILLVAL`` may
fail.
Parameters
----------
v : `~.pycdf.Var` or `~.datamodel.dmarray`
CDF variable, data, or copy to update
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> f = spacepy.pycdf.CDF('foo.cdf', create=True)
>>> v = f.new('Var', data=[1, 2, 3, -1e31])
>>> spacepy.pycdf.istp.fillval(v)
>>> data = v.copy()
>>> data
VarCopy([1., 2., 3., -1.e31], dtype=float32)
>>> spacepy.pycdf.istp.nanfill(data)
>>> data
VarCopy([1., 2., 3., nan], dtype=float32)
"""
#If input is a zVar, read all the data; if not, this is a no-copy operation
indata = v[...]
badidx = numpy.zeros(shape=v.shape, dtype=bool)
if 'FILLVAL' in v.attrs:
badidx |= (indata == v.attrs['FILLVAL'][...])
if 'VALIDMIN' in v.attrs:
badidx |= (indata < v.attrs['VALIDMIN'][...])
if 'VALIDMAX' in v.attrs:
badidx |= (indata > v.attrs['VALIDMAX'][...])
#Try a simple assignment with fancy indexing
try:
v[badidx] = numpy.nan
except (IndexError, ValueError):
pass
else:
return #success
#Fancy indexing failed, do element-by-element assignment
badidx = numpy.transpose(badidx.nonzero())
for i in badidx:
v[tuple(i)] = numpy.nan
class VarBundle(object):
"""Collective handling of ISTP-compliant variable and its dependencies.
Representation of an ISTP-compliant variable bundled together
with its dependencies to enable aggregate operations. Normally
used to copy a subset of data from one CDF or SpaceData to another by
chaining operations, or to load just the relevant data from a CDF
into a `~.datamodel.SpaceData`.
``VarBundle`` operates on a single variable within a file or SpaceData
and its various dependencies, uncertainties, labels, etc. That variable
can be specified one of two ways. An open CDF file or
SpaceData can be passed as the first parameter, and the name of a
variable within it as the second parameter. Or, for CDF files, a
:class:`~.pycdf.Var` can be passed as the only parameter, implicitly
defining the input file (the CDF containing that variable).
Unusual or indecipherable error messages may indicate an ISTP
compliance issue; see `VariableChecks` for some checks.
Parameters
----------
source : `~.pycdf.CDF`, `~.datamodel.SpaceData`, or `~.pycdf.Var`
SpaceData or open CDF containing the variable to process, or the CDF variable itself.
name : `str`
Name of the variable within ``source`` to process ("main variable").
See Also
--------
.datamodel.fromCDF
.pycdf.CDF.copy
Notes
-----
If using :class:`~.datamodel.SpaceData` input, the contents are
assumed to be `ISTP compliant
<https://spdf.gsfc.nasa.gov/sp_use_of_cdf.html>`_. In particular,
the following attributes of the enclosed
:class:`~.datamodel.dmarray` are used (*italics* denotes required):
* *DEPEND_0*, *DEPEND_1*, etc.
* LABL_PTR_0, LABL_PTR_1, etc.
* DELTA_PLUS_VAR, DELTA_MINUS_VAR
* VALIDMIN, VALIDMAX, *FILLVAL*
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> #https://rbsp-ect.newmexicoconsortium.org/data_pub/rbspa/hope/level3/pitchangle/2012/
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> infile['FPDU']
<Var:
CDF_FLOAT [3228, 11, 72]
>
>>> infile['FPDU'].attrs
<zAttrList:
CATDESC: HOPE differential proton flux [CDF_CHAR]
DEPEND_0: Epoch_Ion [CDF_CHAR]
DEPEND_1: PITCH_ANGLE [CDF_CHAR]
DEPEND_2: HOPE_ENERGY_Ion [CDF_CHAR]
...
>
>>> b = spacepy.pycdf.istp.VarBundle(infile['FPDU'])
>>> b = spacepy.pycdf.istp.VarBundle(infile, 'FPDU') # Equivalent
>>> outfile = spacepy.pycdf.CDF('output.cdf', create=True)
>>> b.slice(1, 2, single=True).output(outfile)
<VarBundle:
FPDU: CDF_FLOAT [3228, 72]
Epoch_Ion: CDF_EPOCH [3228]
Epoch_Ion_DELTA: CDF_REAL4 [3228]
PITCH_ANGLE: CDF_FLOAT ---
Pitch_LABL: CDF_CHAR*5 ---
HOPE_ENERGY_Ion: CDF_FLOAT [3228, 72]
ENERGY_Ion_DELTA: CDF_FLOAT [3228, 72]
Energy_LABL: CDF_CHAR*3 [72] NRV
>
>>> outfile['FPDU']
<Var:
CDF_FLOAT [3228, 72]
>
>>> outfile['FPDU'].attrs
<zAttrList:
CATDESC: HOPE differential proton flux [CDF_CHAR]
DEPEND_0: Epoch_Ion [CDF_CHAR]
DEPEND_1: HOPE_ENERGY_Ion [CDF_CHAR]
...
>
>>> outfile.close()
>>> infile.close()
"""
def __init__(self, source, name=None):
"""Initialize variable bundle
Parameters
----------
source : `~spacepy.pycdf.CDF` or `~spacepy.pycdf.Var`
CDF containing the variable to process, or the variable itself.
name : `str`
Name of the variable within ``source`` to process ("main variable").
"""
if name is None and not hasattr(source, 'cdf_file'):
raise TypeError('Single-argument form must be a variable'
' in an open CDF, not {}.'.format(type(source).__name__))
self.mainvar = source if name is None else source[name]
"""The variable to operate on."""
self.cdf = self.mainvar.cdf_file if name is None else source
"""Input CDF file containing the main variable."""
self._name = self.mainvar.name() if name is None else name
"""Name of the main variable"""
self._varinfo = {}
"""Keyed by variable name. Values are also dicts, keys are
``dims``, list of the main variable dimensions corresponding
to each dimension of the variable, ``slice``, the slice
to apply when reading this variable from the input, ``postidx``,
a numpy fancy index to apply after reading, ``thisdim``,
the main dimension for which this var is a dep
(and thus it should be removed if the dim is removed),
``vartype``, whether this variable is the main var (M),
a dependency (D), or DELTA of the main (U, for uncertainty),
``sortorder``, the order in which it should be displayed (0 for
the main variable, 1 for dependencies, 2 for all DELTAs, and 3 for
labels).
"""
self._degenerate = []
"""Index by dim, is it degenerate, i.e. removed in a slice."""
self._summed = []
"""Index by dim, is this dim summed."""
self._mean = []
"""Index by dim, is this dim averaged."""
self._getvarinfo()
def _process_delta(self, mainname, deltaname):
"""Handle DELTA_PLUS/DELTA_MINUS attributes
A DELTA variable should be the same shape and the same
dependencies as its referrer (except potentially NRV).
Parameters
----------
mainname : str
Name of variable that references the DELTA, i.e. it has a
DELTA_PLUS_VAR/DELTA_MINUS_VAR attribute that references
``deltaname``.
deltaname : str
Name of the DELTA variable itself.
Returns
-------
dict
dims/slice information suitable for inclusion in ``_varinfo``.
"""
thisvar = self.cdf[deltaname]
mainvar = self.cdf[mainname]
for a in thisvar.attrs: #Check that all dependencies match
if not a.startswith(('DEPEND_', 'LABL_PTR_')):
continue
if a in mainvar.attrs:
if thisvar.attrs[a] != mainvar.attrs[a]:
raise ValueError('{}: attribute {} mismatch with main var'
.format(deltaname, a))
elif thisvar.attrs[a] != mainname:
raise ValueError('{}: attribute {} not in main var'
.format(deltaname, a))
rv = thisvar.rv() if hasattr(thisvar, 'rv')\
else 'DEPEND_0' in thisvar.attrs
if rv and not self._varinfo[mainname]['rv']:
raise ValueError(
'{}: Cannot handle RV DELTA with NRV variable.'
.format(deltaname))
thisshape = thisvar.shape
mainshape = mainvar.shape
if not rv and self._varinfo[mainname]['rv']: #Ignore record dim
mainshape = mainshape[1:]
if thisshape != mainshape:
raise ValueError('{}: DELTA/main var shape mismatch.'
.format(deltaname))
#If this is NRV and main is RV, that's okay, the R dim will
#get removed when actually slicing.
result = { k: self._varinfo[mainname][k][:]
for k in ('dims', 'slice', 'postidx') }
result.update({
'dv': thisvar.dv() if hasattr(thisvar, 'dv')\
else [True] * (len(result['dims']) - 1),
'rv': rv,
'sortorder': 2,
})
return result
def _getvarinfo(self):
"""Find dependency and dimension information
For main variable and its dependencies, find how dimensions
relate to the main variable, and find all DELTA variables.
"""
rv = self.mainvar.rv() if hasattr(self.mainvar, 'rv')\
else 'DEPEND_0' in self.mainvar.attrs
#Every dim maps back to itself for the main variable
dims = list(range(len(self.mainvar.shape) + int(not rv)))
self._degenerate = [False] * len(self.mainvar.shape)
self._summed = [False] * len(self.mainvar.shape)
self._mean = [False] * len(self.mainvar.shape)
if not rv: #Fake the 0-dim
self._degenerate.insert(0, False)
self._summed.insert(0, False)
self._mean.insert(0, False)
#And every dimension is a full slice, to start
self._varinfo[self._name] = {
'dims': dims,
# Dim variance is CDF concept--if not specified, assume True
'dv': self.mainvar.dv() if hasattr(self.mainvar, 'dv')\
else [True] * (len(dims) - 1),
'slice': [slice(None)] * len(dims),
'postidx': [slice(None)] * len(dims),
'rv': rv,
'sortorder': 0,
'vartype': 'M',
}
mainattrs = self.mainvar.attrs
#Get the attributes that matter in the MAIN var
attrs = {a: mainattrs[a] for a in mainattrs
if a.startswith(('DEPEND_', 'LABL_PTR'))
or a in ('DELTA_PLUS_VAR', 'DELTA_MINUS_VAR')}
for a in attrs: #Process DEPEND/LABL_PTR variables
if not a.startswith(('DEPEND_', 'LABL_PTR_')):
continue
thisname = attrs[a]
if thisname in self._varinfo: #Already handled
if self._varinfo[thisname]['sortorder'] == 3 \
and a.startswith('DEPEND_'):
#Processed before as a LABL, but also is a DEPEND.
#Technically ISTP violation, but have the DEPEND take
#priority
self._varinfo[thisname]['sortorder'] = 1
continue
thisvar = self.cdf[thisname]
#Dimension of main var that corresponds to this var
dim = int(a.split('_')[-1])
dims = [0,] #Record dim always matches
rv = thisvar.rv() if hasattr(thisvar, 'rv')\
else 'DEPEND_0' in thisvar.attrs or a == 'DEPEND_0'
#For every CDF (non-record) dim, match to the main variable
for i in range(1, len(thisvar.shape) + int(not rv)):
#DEPEND; LABL_PTR for this dimension
dim_dep = 'DEPEND_{}'.format(i)
labl_dep = 'LABL_PTR_{}'.format(i)
if not dim_dep in thisvar.attrs:
#No depend on this dim, so it's the dim that's represented
#in this variable
dims.append(dim)
else: #Match to parent var
dim_dep = thisvar.attrs[dim_dep]
parentdim = next((
int(d.split('_')[-1]) for d in attrs
if (d.startswith('DEPEND_') and attrs[d] == dim_dep)
or (d.startswith('LABL_PTR_') and attrs[d] == labl_dep)
), None)
if parentdim is None:
raise ValueError('Cannot match dim {} of {}'.format(
i, thisname))
dims.append(parentdim)
if dims.count(dim) != 1:
raise ValueError('Cannot find unique dimension for {}'
.format(thisname))
self._varinfo[thisname] = {
'dims': dims,
'dv': thisvar.dv() if hasattr(thisvar, 'dv')\
else [True] * (len(dims) - 1),
'slice': [slice(None)] * len(dims),
'postidx': [slice(None)] * len(dims),
'rv': rv,
'sortorder': 1 if a.startswith('DEPEND_') else 3,
'thisdim': dim,
'vartype': 'D',
}
#Process DELTAs of the DEPENDs
for d in ('DELTA_PLUS_VAR', 'DELTA_MINUS_VAR'):
if d not in thisvar.attrs:
continue
deltaname = thisvar.attrs[d]
if deltaname in self._varinfo:
continue
self._varinfo[deltaname] \
= self._process_delta(thisname, deltaname)
self._varinfo[deltaname]['vartype'] = 'D' #just like other deps
self._varinfo[deltaname]['thisdim'] = dim
for a in ('DELTA_PLUS_VAR', 'DELTA_MINUS_VAR'): #Process DELTA vars
if not a in attrs:
continue
thisname = attrs[a]
if thisname not in self._varinfo:
#If DELTA_PLUS/DELTA_MINUS are same var, skip second one
self._varinfo[thisname] \
= self._process_delta(self._name, thisname)
self._varinfo[thisname]['vartype'] = 'U'
def slice(self, dim, start=None, stop=None, step=None,
single=False):
"""Slice on a single dimension
Selects subset of a dimension to include in the output. Slicing
is done with reference to the dimensions of the main variable and
the corresponding dimensions of all other variables are sliced
similarly. The first non-record dimension of the variable is always
1; 0 is the record dimension (and is ignored for NRV variables).
Multiple slices can be applied to select subsets of multiple
dimensions; however, if one dimension is indexed multiple
times, only the last one in the chain takes effect.
Interpretation of the slice parameters is like normal Python slicing,
including the ability to use negative values, etc.
Passing in only a dimension "resets" the slice to include the
entire dimension.
Parameters
----------
dim : int
CDF dimension to slice on. This is the dimension as specified
in the CDF (0-base for RV variables, 1-base for NRV) and does
not change with successive slicing. Each dimension can only be
sliced once.
single : bool
Treat ``start`` as a single index and return only that index
(reducing dimensionality of the data by one.)
start : int
Index of first element of ``dim`` to include in the output.
This can also be a sequence of indices to include, in which
case ``stop`` and ``step`` must not be specified. This can be
substantially slower than specifying ``stop`` and ``step``.
stop : int
Index of first element of ``dim`` to exclude from the output.
step : int
Increment between elements to include in the output.
Returns
-------
VarBundle
This bundle, for method chaining. This is not a copy: the
original object is updated.
Examples
--------
See the `VarBundle` examples for creating output from
the slices.
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['FPDU'])
>>> #Select index 2 from axis 1
>>> b.slice(1, 2, single=True)
>>> #Select from index 5 to end for axis 2, keeping index 2 from axis 1
>>> b.slice(2, 5)
>>> #Select 10 through 15 on axis 2, but all of axis 1
>>> b.slice(1).slice(2, 10, 15)
>>> #Select just record 5 and 10
>>> b.slice(2).slice(0, [5, 10])
>>> infile.close()
"""
if single and (self._summed[dim] or self._mean[dim]):
raise ValueError('Cannot sum/average on a single-element slice.')
self._degenerate[dim] = single
fancyidx = (stop is None and step is None and numpy.ndim(start) != 0)
sl = slice(None, None, None) if fancyidx else slice(start, stop, step)
for v in self._varinfo.values():
if not dim in v['dims']:
continue #This "main" var dimension isn't in this var
idx = v['dims'].index(dim)
#The slice to perform on read
v['slice'][idx] = start if single else sl
#And the slice to perform after the fact
if fancyidx:
v['postidx'][idx] = start
return self
def sum(self, dim):
"""Sum across a dimension.
Total the main variable of the bundle across the given dimension.
That dimension disappears from the output and dependencies
(including their uncertainties) are assumed to be constant across
the summed dimension. The uncertainty of the main variable, if
any, is appropriately propagated (quadrature sum.)
An invalid value for any element summed over will result in a fill
value on the output. This does not work well for variables that
define multiple VALIDMIN/VALIDMAX based on position within a
dimension; the smallest VALIDMIN/largest VALIDMAX is used rather than
the position-specific value.
Summing occurs after slicing, to allow summing of a subset of
a dimension. A single element slice (which removes the dimension)
is incompatible with summing over that dimension.
There is not currently a way to "undo" a sum; create a new
bundle instead.
Parameters
----------
dim : int
CDF dimension to total. This is the dimension as specified
in the CDF (0-base for RV variables, 1-base for NRV) and does
not change with successive slicing or summing. This must be a
positive number (no support for e.g. -1 for last dimension.)
Returns
-------
VarBundle
This bundle, for method chaining. This is not a copy: the
original object is updated.
Examples
--------
See the `VarBundle` examples for creating output.
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['Counts_P'])
>>> #Total over dimension 1 (pitch angle)
>>> b.sum(1)
>>> #Get a new bundle (without the previous sum)
>>> b = spacepy.pycdf.istp.VarBundle(infile['Counts_P'])
>>> #Total over first 10 elements of dimension 2 (energy bins)
>>> b.slice(2, 0, 10).sum(2)
>>> infile.close()
"""
if self._degenerate[dim]:
raise ValueError('Cannot sum on a single-element slice.')
if self._mean[dim]:
raise ValueError('Cannot sum and take mean of same dimension.')
self._summed[dim] = True
return self
def mean(self, dim):
"""Take the mean of a dimension.
Take mean of the main variable of the bundle across the given
dimension. That dimension disappears from the output and dependencies
(including their uncertainties) are assumed to be constant across
the summed dimension. The uncertainty of the main variable, if
any, is appropriately propagated.
Invalid values are excluded from the mean. This does not work well
for variables that define multiple VALIDMIN/VALIDMAX based on
position within a dimension; the smallest VALIDMIN/largest VALIDMAX
is used rather than the position-specific value.
Averaging occurs after slicing, to allow averaging of a subset of
a dimension. A single element slice (which removes the dimension)
is incompatible with averaging over that dimension.
There is not currently a way to "undo" a mean; create a new
bundle instead.
Parameters
----------
dim : int
CDF dimension to average. This is the dimension as specified
in the CDF (0-base for RV variables, 1-base for NRV) and does
not change with successive slicing or summing. This must be a
positive number (no support for e.g. -1 for last dimension.)
Returns
-------
VarBundle
This bundle, for method chaining. This is not a copy: the
original object is updated.
Examples
--------
See the `VarBundle` examples for creating output.
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['Counts_P'])
>>> #Average over dimension 1 (pitch angle)
>>> b.mean(1)
>>> #Get a new bundle (without the previous sum)
>>> b = spacepy.pycdf.istp.VarBundle(infile['Counts_P'])
>>> #Average over first 10 elements of dimension 2 (energy bins)
>>> b.slice(2, 0, 10).mean(2)
>>> infile.close()
"""
if self._degenerate[dim]:
raise ValueError('Cannot average on a single-element slice.')
if self._summed[dim]:
raise ValueError('Cannot sum and take mean of same dimension.')
self._mean[dim] = True
return self
def _tokeep(self):
"""Determine which variables to keep for output
Dependencies for dimensions which disappear after slicing, and
other variables that they depend on, shouldn't be included in
the output
Returns
-------
list of str
Names of variables to include in the output.
"""
#What dims of main var disappear?
deleted = [i for i in range(len(self._degenerate))
if any((self._degenerate[i], self._summed[i],
self._mean[i]))]
return [v for v, i in self._varinfo.items()
if i.get('thisdim', None) not in deleted]
def _same(self, newvar, invar, rv, dv, dims, data):
"""Checks if an existing variable matches a proposed new variable
Does not compare DEPEND and LABL_PTR attributes (those are handled
later.)
Parameters
----------
newvar : `~.pycdf.Var`
Existing variable to compare to requirements
invar : `~.pycdf.Var`
Variable to use as reference for attributes, RV, CDF type,
number of elements.
rv : bool
Is the new variable record-varying
dv : list of bool
Data variance for each dimension.
dims : list of int
Size of each dimension.
data : `~numpy.ndarray`
Data that should be in the variable.
Returns
-------
bool
True if the existing variable is the same; False if not.
"""
# CDF output only checks
if hasattr(newvar, 'type'):
if newvar.rv() != rv or newvar.dv() != dv:
return False
if hasattr(invar, 'type') and newvar.type() != invar.type():
return False
if hasattr(invar, 'nelems') and newvar.nelems() != invar.nelems():
return False
#Check basic type, dimensions, etc.
if newvar.dtype != invar.dtype\
or len(dims) != (len(newvar.shape) - rv) \
or list(dims) != list(newvar.shape[rv:]):
return False
ia = invar.attrs
na = newvar.attrs
for a in ia:
if a.startswith(('DEPEND_', 'LABL_PTR_')) \
or a == 'FIELDNAM':
#depends/LABL PTR shift around, and FIELDNAM may change,
#so test outside of this function.
continue
if not a in na or not numpy.array_equal(ia[a], na[a]):
return False
# CDF input *and* output only
if hasattr(na, 'type') and hasattr(ia, 'type')\
and ia.type(a) != na.type(a):
return False
#Finally check the data
return (data == newvar[...]).all()
def _namemap(self, suffix=None):
"""Map old variable names to new
Helper for `output` that maps the variable name in the
input CDF to variable name in the output CDF.
Parameters
----------
suffix : str
String to append to name of variables that are changed
from input to output.
Returns
-------
dict
Keyed by name in input, values are name in the output. No
entry for names that don't change.
"""
namemap = {}
if suffix is not None:
for vname, vinfo in self._varinfo.items():
if vinfo['vartype'] in ('M', 'U'):
namemap[vname] = vname + suffix
else: #Dependency. If any slice/sum, it's changed
if any([any((self._summed[d], self._mean[d]))
for d in vinfo['dims']]) \
or any([s != slice(None) for s in itertools.chain(
vinfo['slice'], vinfo['postidx'])]):
namemap[vname] = vname + suffix
return namemap
def _sum_avg(self, data, invar, vinfo, degen, summed, averaged):
"""Sum/average data
Helper for `output` that performs summing and averaging
of the data for a single variable. Note dimensionality of all
input is before the removal of degenerate dimensions
(this function does the translation using ``degen``), and it is
by dimension not axis (so NRV variables have a vestigial 0th
dimension that is not interpreted.)
Parameters
----------
data : `numpy.ndarray`
Data as read from input CDF and properly sliced.
invar : `~.pycdf.Var`
CDF input variable from which ``data`` was read.
vinfo : dict
Value from instance variable ``_varinfo`` for this variable.
degen : list of bool
For each dimension of this variable, whether the dimension
is degenerate (i.e. already gone at this point.)
summed : list of bool
For each dimension of this variable, whether the dimension
should be summed over.
averaged : list of bool
For each dimension of this variable, whether the dimension
should be averaged over.
Returns
-------
`numpy.ndarray`
Data summed/averaged over dimensions according to ``summed``
and ``averaged`` inputs.
"""
#Correction for NRV variables in the mapping between dim and axis
nrv = int(not vinfo['rv'])
#Degenerate slices have already been removed, so need
#a map from old dim numbers to new ones. Note removing
#the record dimension does not shift other dims!
newdims = [None if degen[i] else i - sum(degen[1:i])
for i in range(len(degen))]
#Axis numbers to sum, with degenerate removed
#(NRV means dim 0 is axis 1, so correct for that,
#and also don't do any actions on dim 0 for NRV)
summe = [newdims[i] - nrv
for i in range(nrv, len(summed)) #old dim
if newdims[i] is not None and (summed[i] or averaged[i])]
avgme = [newdims[i] - nrv
for i in range(nrv, len(averaged)) #old dim
if newdims[i] is not None and averaged[i]]
#Sum over axes in reverse order so axis renumbering
#doesn't affect future sums
a = invar.attrs
for ax in summe[::-1]:
if vinfo['vartype'] == 'D':
#If sum over DEPEND, must be constant over axis
data = data.take(0, axis=ax)
continue
invalid = numpy.isclose(data, a['FILLVAL'])
if 'VALIDMIN' in a:
invalid = numpy.logical_or(
invalid, data < numpy.min(a['VALIDMIN']))
if 'VALIDMAX' in a:
invalid = numpy.logical_or(
invalid, data > numpy.max(a['VALIDMAX']))
data[invalid] = 0 #avoids warning and helps with mean
if vinfo['vartype'] == 'M':
data = data.sum(axis=ax)
elif vinfo['vartype'] == 'U': #propagate error
data = numpy.sqrt((data ** 2).sum(axis=ax))
else: #Should not happen
raise ValueError('Bad summation type.')
if ax in avgme: #divide out
count = numpy.sum(~invalid, axis=ax, dtype=data.dtype)
invalid = (count == 0)
count[invalid] = 1 #avoid warning
data = data / count
else: #Sum, so any fill on axis means value is fill
invalid = invalid.max(axis=ax)
data[invalid] = a['FILLVAL']
return data
def _repoint_depend(self, invar, newvar, preexist, namemap, degen):
"""Change DEPEND for new dimensionality of one variable.
Slicing/summing might change variable dimensionality and thus
the relationship with its DEPENDs, and the DEPENDs themselves
may have a new name. This updates the DEPEND attributes for
these changes, or verifies they are correct if the output
variable already exists.
Parameters
----------
invar : `~.pycdf.Var`
The input variable (opened in raw mode).
newvar : `~.pycdf.Var`
The output variable (opened in raw mode).
preexist : bool
True if ``newvar`` existed and doing a consistency check;
False if ``newvar`` was newly created and should be edited.
namemap : dict
Map from name in input variable (key) to name in output
variable (value). No entry if name didn't change.
degen : list of bool
For each dimension of this variable, whether the dimension
is degenerate (i.e. already gone at this point.) This
includes any degeneracy from summing/averaging as well as
slicing.
"""
#Index by old dim; returns the new dim (None if went away)
#Note slicing away DEPEND_0 (record dimension) does NOT change
#subsequent depends!
newdims = [None if degen[i] else i - sum(degen[1:i])
for i in range(len(degen))]
for a in list(newvar.attrs.keys()): #Editing in loop!
#Handle a suffixed DELTA if necessary
if a.startswith('DELTA_'):
olddelta = invar.attrs[a]
if isinstance(olddelta, bytes):
olddelta = olddelta.decode('ascii')
newvar.attrs[a] = namemap.get(olddelta, olddelta)
continue
if not a.startswith(('DEPEND_', 'LABL_PTR_')):
continue
newdim = int(a.split('_')[-1])
oldval = None #Sentinel value
if newdim in newdims: #An old value that belongs in this dim
olddim = newdims.index(newdim)
old_a = '{}_{}'.format('_'.join(a.split('_')[:-1]),
olddim)
oldval = invar.attrs.get(old_a, None)
if isinstance(oldval, bytes):
oldval = oldval.decode('ascii')
if oldval is not None:
#Check for variable renaming from the input to output
newval = namemap.get(oldval, oldval)
if preexist:
existingval = newvar.attrs[a]
if isinstance(existingval, bytes):
existingval = existingval.decode('ascii')
if existingval != newval:
raise RuntimeError(
'Incompatible {} already exists in output.'
.format(newvar.name()))
else:
newvar.attrs[a] = newval
else:
#Either there's no corresponding old dim, or it didn't have
#a DEPEND. Either way, shouldn't be a DEPEND in the new dim.
if preexist:
if a in newvar.attrs:
raise RuntimeError(
'Incompatible {} already exists in output.'
.format(newvar.name()))
else:
del newvar.attrs[a]
def _outshape(self, vname):
"""Calculate shape of the variable on output
Parameters
----------
vname : str
Name of the variable to check the shape of.
Returns
-------
tuple
The shape of the variable after all slicing, etc. is applied, or
None if the variable is not included in the output.
"""
if vname not in self._tokeep():
return None
vinfo = self._varinfo[vname]
invar = self.cdf[vname]
rv = vinfo['rv']
shape = invar.shape
sl = vinfo['slice']
postidx = vinfo['postidx']
#no dimension has BOTH a slice and a postindex, so combine
slices = [pi if s == slice(None, None, None) else s
for s, pi in zip(sl, postidx)]
#And any dim that is summed/averaged is degenerate, so
#slice with a single index to make it go away
for d in vinfo['dims']:
if self._summed[d] or self._mean[d]:
slices[d] = 0
if not rv: #Remove record dimension
slices = slices[1:]
#Make a fake array the size of the input, and slice it
return numpy.empty(shape=shape)[tuple(slices)].shape
def variables(self):
"""Description of variable output from the bundle
Provides information describing the variables output
from the bundle
Returns
-------
list
Each element is a list-of-tuples. The list corresponds to a
dimension of the master var: first the master var itself, then the
uncertainties and labels associated with each dimension. Each
element of these sublists is then a tuple of variable name and
shape on the output (itself a tuple). If a variable isn't
included in the output (sliced away), its shape will be ``None``.
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['FPDU'])
>>> b.slice(1, 2, single=True).variables()
[[('FPDU', (100, 72))],
[('Epoch_Ion', (100,)), ('Epoch_Ion_DELTA', (100,))],
[('PITCH_ANGLE', None), ('Pitch_LABL', None)],
[('HOPE_ENERGY_Ion', (100, 72)),
('ENERGY_Ion_DELTA', (100, 72)),
('Energy_LABL', (72,))]]
"""
#List of every variable in each dimension
v_by_dim = functools.reduce(
lambda x, vname:
x[self._varinfo[vname].get('thisdim', None)].append(vname) or x,
self._varinfo.keys(), collections.defaultdict(list))
for l in v_by_dim.values():
l.sort(key=lambda x: (self._varinfo[x]['sortorder'], x))
variables = [[(v, self._outshape(v))
for v in v_by_dim.get(None, [])]]
vi = self._varinfo[self._name]
for dim in vi['dims']:
variables.append([
(v, self._outshape(v)) for v in v_by_dim.get(dim, [])])
return variables
def operations(self):
"""Operations of this bundle
Provides information describing the operations this bundle
would perform.
Returns
-------
list
Each element is a tuple: first element is a string with
the name of the operation (i.e. method of
`VarBundle`), next is also a tuple of positional
arguments, and finally a dict of keyword arguments.
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['FPDU'])
>>> b.slice(1, 2, single=True).operations()
[('slice', (1, 2), {'single': True})]
>>> #Apply same operations to a different variable
>>> b2 = spacepy.pycdf.istp.VarBundle(infile['FEDU'])
>>> for op, args, kwargs in b2.operations():
... getattr(b2, op)(*args, **kwargs)
"""
ops = []
vi = self._varinfo[self._name]
for dim in vi['dims']:
sl = vi['slice'][dim]
postidx = vi['postidx'][dim]
if sl != slice(None, None, None): #simple slice
if isinstance(sl, slice): #slice
ops.append((
'slice',
tuple((s for s in (dim, sl.start, sl.stop, sl.step)
if s is not None)),
{}))
else: #single index
ops.append(('slice', (dim, sl), {'single': True}))
elif postidx != slice(None, None, None): #fancy index
ops.append(('slice', (dim, postidx,), {}))
for v, name in zip((self._mean, self._summed), ('mean', 'sum')):
if v[dim]:
ops.append((name, (dim,), {}))
return ops
def output(self, output, suffix=None):
"""Output the variables as modified
Parameters
----------
output : `~.pycdf.CDF`, `~.datamodel.SpaceData`
Output container to receive the new data, may be an open CDF
file or a SpaceData.
suffix : str
Suffix to append to the name of any variables that are changed
for the output. This allows the output to contain multiple
variables derived from the same input variable. The main variable
and its DELTA variables will always have the suffix applied.
Any dependencies will have the suffix applied only if they have
changed from the input CDF (e.g. from slicing.)
Returns
-------
VarBundle
This bundle, for method chaining.
See Also
--------
toSpaceData
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['FPDU'])
>>> outfile = spacepy.pycdf.CDF('output.cdf', create=True)
>>> #Output the low energy half in one variable
>>> b.slice(2, 0, 36).output(outfile, suffix='_LoE')
>>> #And the high energy half in another variable
>>> b.slice(2, 36, 72).output(outfile, suffix='_HiE')
>>> outfile.close()
>>> infile.close()
"""
tokeep = self._tokeep()
namemap = self._namemap(suffix)
for vname in tokeep:
vinfo = self._varinfo[vname]
#Dim of main var that depends on this (None if main var or delta)
maindim = vinfo.get('thisdim', None)
#Degeneracy of dimensions in this variable's "frame"
degen = [self._degenerate[d] for d in vinfo['dims']]
#And whether the dim was summed
summed = [self._summed[d] for d in vinfo['dims']]
#And averaged
averaged = [self._mean[d] for d in vinfo['dims']]
# Raw data for CDF input *and* output only
invar = self.cdf.raw_var(vname) if hasattr(output, 'raw_var')\
and hasattr(self.cdf, 'raw_var') else self.cdf[vname]
sl = vinfo['slice'] #including 0th dim
postidx = vinfo['postidx']
#Dimension size/variance for original variable
#(0 index is CDF dimension 1)
dv = self._varinfo[vname]['dv']
rv = self._varinfo[vname]['rv'] #and record variance
#Scrub degenerate dimensions from the post-indexing
#(record is never degenerate)
postidx = [postidx[i] for i in range(len(postidx))
if not degen[i]]
#Now get the data, and sum/average it
if not rv: #Remove fake record dimension
sl = sl[1:]
postidx = postidx[1:]
#Forces array scalars, makes the rest work better
data = numpy.asanyarray(invar.__getitem__(tuple(sl)))
if postidx:
data = data[tuple(postidx)]
data = self._sum_avg(data, invar, vinfo, degen, summed, averaged)
#Summed/averaged dimensions are now also degenerate
degen = [max(v) for v in zip(degen, summed, averaged)]
#Get shape of output variable from actual data
dims = data.shape
#Raw Epoch16 have a trailing (2,)
if hasattr(invar, 'type')\
and invar.type() == spacepy.pycdf.const.CDF_EPOCH16.value:
dims = dims[:-1]
#Cut out any degenerate dimensions from DV (skipping record dim)
dv = [dv[i] for i in range(len(dv)) if not degen[i + 1]]
#Change record variance for the output if sliced away 0th
if rv and degen[0]:
rv = False
if rv: #remove record dimension from size IF output is RV
dims = dims[1:]
#Rename the variable if necessary
outname = namemap.get(vname, vname)
if outname in output:
preexist = True
newvar = output.raw_var(outname) if hasattr(output, 'raw_var')\
and hasattr(self.cdf, 'raw_var') else output[outname]
if not self._same(newvar, invar, rv, dv, dims, data):
raise RuntimeError(
'Incompatible {} already exists in output.'
.format(outname))
else:
preexist = False
if hasattr(output, 'new'):
t = invar.type() if hasattr(invar, 'type') else None
try:
compress, compress_param = invar.compress()
except (TypeError, AttributeError):
# arrays have a different "compress"
compress, compress_param = None, None
ne = invar.nelems() if hasattr(invar, 'nelems') else None
newvar = output.new(
outname, data=data,
type=t, recVary=rv,
dimVarys=dv, dims=dims,
n_elements=ne,
compress=compress, compress_param=compress_param)
newvar.attrs.clone(invar.attrs)
else:
newvar = spacepy.datamodel.dmarray(data, attrs=invar.attrs.copy())
output[outname] = newvar
if vname != outname: #renamed
newvar.attrs['FIELDNAM'] = outname
self._repoint_depend(invar, newvar, preexist, namemap, degen)
return self
def toSpaceData(self, suffix=None):
"""Return variables, as modified.
Convenience function to call `output` on a new
`~.datamodel.SpaceData` and return it.
Parameters
----------
suffix : str
Appended to the name of variables changed on output; see
`output` for details.
Returns
-------
`.datamodel.SpaceData`
Data read from input and processed according to the defined
operations.
See Also
--------
output
Examples
--------
>>> import spacepy.pycdf
>>> import spacepy.pycdf.istp
>>> infile = spacepy.pycdf.CDF('rbspa_rel04_ect-hope-PA-L3_20121201_v7.1.0.cdf')
>>> b = spacepy.pycdf.istp.VarBundle(infile['FPDU'])
>>> data = b.slice(1, 2, single=True).toSpaceData()
>>> infile.close()
>>> data.tree()
+
|____ENERGY_Ion_DELTA
|____Energy_LABL
|____Epoch_Ion
|____Epoch_Ion_DELTA
|____FPDU
|____HOPE_ENERGY_Ion
"""
sd = spacepy.datamodel.SpaceData()
self.output(sd, suffix=suffix)
return sd
@staticmethod
def _vtype(v):
"""String representation of type of a variable
Parameters
----------
v
Open CDF variable, numpy array, or similar
Returns
-------
str
String representation of type of ``v``, either as CDF type
or numpy type
"""
# Kludge, but assumes main CDF code gets it right
res = str(v).split(' ')[0]
if res.startswith('CDF_'):
return res
return str(v.dtype)
def __str__(self):
"""String representation of the bundle
Returns a string representation of the bundle, which is all the
variables that are involved on the input. Variables which are
not included in the output show ``---`` in place of their shape.
Returns
-------
str
Brief string description of the bundle.
"""
return '\n'.join([
'{}{}: {} {}{}'.format(
' ' * 4 if self._varinfo[vname]['sortorder'] > 1 else '',
vname,
self._vtype(self.cdf[vname]),
str(list(shape)) if shape is not None else '---',
#RV vars always have dim 0 as axis 0, so they become
#NRV iff dim 0 of the main var goes away
' NRV' if shape is not None
and (not self._varinfo[vname]['rv'] or max(
self._degenerate[0], self._summed[0], self._mean[0]))
else ''
)
for dimvars in self.variables() for vname, shape in dimvars])
def __repr__(self):
"""Representation of bundle
Cannot return anything that can be evaluated to create a copy
of the CDF, so this is just the informal str representation in
angle brackets.
Returns
-------
str
Informal representation of bundle contents.
"""
return '<VarBundle:\n{}\n>'.format(str(self))