Source code for maidenhair.loaders.base

#!/usr/bin/env python
# coding=utf-8
"""
An abstract loader class

"""
__author__  = 'Alisue (lambdalisue@hashnote.net)'
import os
import warnings
import itertools
import numpy as np
from glob import glob
from natsort import natsorted
from maidenhair.utils.rglob import glob as rglob


class BaseLoader(object):
    """
    An abstract loader class.

    Concrete behavior is provided by the ``parser`` instance, whose
    ``load(filename, **kwargs)`` method is expected to return a 2-D
    array-like of the file's data.
    """
    def __init__(self, using=None, parser=None):
        """
        Construct loader class

        Parameters
        ----------
        using : list of integer or slice instance, optional
            A default list of index or slice instance.
            It will be used when :attr:`using` is not specified in
            :meth:`maidenhair.loader.base.BaseLoader.load` method.
        parser : instance or None, optional
            A default instance of parser class.
            It will be used when :attr:`parser` is not specified in
            :meth:`maidenhair.loader.base.BaseLoader.load` method.
        """
        self.using = using
        self.parser = parser

    def load(self, filename, using=None, parser=None, **kwargs):
        """
        Load data from file using a specified parser.

        Return value will be separated or sliced into a column list.

        Parameters
        ----------
        filename : string
            A data file path
        using : list of integer, slice instance, or None, optional
            A list of index or slice instance used to slice data into
            column. If it is not specified, :attr:`using` specified in
            constructor will be used instead.
        parser : instance or None, optional
            An instance or registered name of parser class.
            If it is not specified, :attr:`parser` specified in
            constructor will be used instead.

        Returns
        -------
        ndarray
            A list of numpy array

        Raises
        ------
        AttributeError
            If no parser was given here nor in the constructor.
        """
        # Fall back to the constructor defaults only when the argument is
        # actually omitted (None). Using `or` here would silently discard
        # an explicitly passed empty `using` list.
        if using is None:
            using = self.using
        if parser is None:
            parser = self.parser
        if parser is None:
            raise AttributeError("A parser instance must be specified")
        # parse iterator with the specified parser
        data = parser.load(filename, **kwargs)
        # slice column by using
        return slice_columns(data, using)

    def glob(self, pathname, using=None, unite=False, basecolumn=0,
             parser=None, with_filename=False, recursive=False,
             natsort=True, **kwargs):
        """
        Load data from every file matched with the given glob pattern.

        Return value will be a list of data unless :attr:`unite` is
        `True`.  If :attr:`unite` is `True`, all dataset will be united
        into a single data.

        Parameters
        ----------
        pathname : string
            A glob pattern
        using : list of integer, slice instance, or None, optional
            A list of index or slice instance used to slice data into
            column. If it is not specified, :attr:`using` specified in
            constructor will be used instead.
        unite : boolean, optional
            If it is `True` then dataset will be united into a single
            numpy array.
        basecolumn : integer, optional
            An index of base column. All data will be trimmed based on
            the order of this column when the number of samples are
            different among the dataset.
            It only affect when :attr:`unite` is specified as `True`.
        parser : instance, optional
            An instance or registered name of parser class.
            If it is not specified, :attr:`parser` specified in
            constructor will be used instead.
        with_filename : boolean, optional
            If it is `True`, returning dataset will contain filename in
            the first column. It cannot be used with :attr:`unite = True`.
        recursive : boolean, optional
            Recursively find pattern in the directory
        natsort : boolean
            Naturally sort found files.

        Returns
        -------
        ndarray
            A list of numpy array

        Raises
        ------
        AttributeError
            If both `unite` and `with_filename` are `True`.
        """
        # argument check
        if unite and with_filename:
            raise AttributeError(
                "`with_filename` attribute cannot be set True when "
                "`unite` attribute was set True.")
        # make sure that the pathname is absolute
        pathname = os.path.abspath(pathname)
        if recursive:
            filelist = rglob(pathname)
        else:
            filelist = glob(pathname)
        if natsort:
            # NOTE(review): `number_type` was removed from recent natsort
            # releases — confirm the pinned natsort version supports it.
            filelist = natsorted(filelist, number_type=None)
        # create dataset
        dataset = []
        for filename in filelist:
            data = self.load(filename=filename,
                             using=using,
                             parser=parser,
                             **kwargs)
            if with_filename:
                data = [filename] + data
            dataset.append(data)
        # tell the number of files found if verbose is True
        # (parenthesized single-argument form prints identically on
        # Python 2 and Python 3)
        if kwargs.get('verbose', False):
            print("%d files are found with `%s`" % (
                len(dataset), os.path.relpath(pathname)))
        # warn if nothing have found unless quiet is True
        if len(dataset) == 0 and not kwargs.get('quiet', False):
            warnings.warn("Nothing found with glob pattern '%s'" % pathname)
        # unite dataset if specified
        if unite and len(dataset) > 0:
            dataset = unite_dataset(dataset, basecolumn)
        return dataset
def slice_columns(x, using=None):
    """
    Slice a numpy array into a list of columns.

    Parameters
    ----------
    x : ndarray
        A numpy array instance
    using : list of integer or slice instance or None, optional
        A list of index or slice instance. When omitted, every column
        of `x` is returned individually.

    Returns
    -------
    ndarray
        A list of numpy array columns sliced
    """
    # Default to one selector per column of the (assumed 2-D) array.
    selectors = range(0, len(x[0])) if using is None else using
    columns = []
    for selector in selectors:
        columns.append(x[:, selector])
    return columns
[docs]def unite_dataset(dataset, basecolumn=0): """ Unite dataset into a single data Parameters ---------- dataset : list of ndarray A data list of a column list of a numpy arrays basecolumn : integer, optional An index of base column. All data will be trimmed based on the order of this column when the number of samples are different among the dataset Returns ------- list of numpy array A column list of a numpy array """ ndata = [None] * len(dataset[0]) for pdata in dataset: # select basecolumn bnx = ndata[basecolumn] bpx = pdata[basecolumn] if bnx is not None and bnx.ndim >= 2: bnx = bnx[:,-1] if bpx is not None and bpx.ndim >= 2: bpx = bpx[:,-1] # calculate min and max of this and final data if bnx is not None and len(bnx) != len(bpx): # the number of samples is different, so regulation is required xmin = max(np.min(bnx), np.min(bpx)) xmax = min(np.max(bnx), np.max(bpx)) # slice the data nindex = np.where((bnx>xmin) & (bnx<xmax)) pindex = np.where((bpx>xmin) & (bpx<xmax)) else: nindex = None pindex = None for i, (nx, px) in enumerate(itertools.izip(ndata, pdata)): if nindex: nx = nx[nindex] if pindex: px = px[pindex] ndata[i] = px if nx is None else np.c_[nx, px] return [ndata]