Source code for loader

import struct
import numpy
import scipy.io
import os
import scipy.sparse as sps
from antk.lib import termcolor as tc
import tarfile
import os.path
import urllib
import numbers
import repr

##====================================================================================
##==========Proposed extensions=======================================================
##DataSet.split(scheme={devtraintest, crossvalidate, traintest}) returns DataSets
##DataSets.join() returns DataSet (combines train or cross validation)
##DataSet + DataSet returns DataSet
##DataSets + DataSets returns DataSets
##DataSets constructor from list of DataSet objects

slash = '/'
if os.name == 'nt':
    slash = '\\'  # so this works in Windows


# ============================================================================================
# ==========EXCEPTIONS FOR MINIMAL DATA INTEGRITY CHECKS======================================
# ============================================================================================
class Bad_directory_structure_error(Exception):
    '''Raised when a specified data directory does not contain a subfolder named in the
    *folders* argument to :any:`read_data_sets`.'''
    pass

class Unsupported_format_error(Exception):
    '''Raised when a file is requested to be loaded or saved without one of the supported
    file extensions.'''
    pass

class Mat_format_error(Exception):
    '''Raised if the .mat file being read does not contain a variable named *data*.'''
    pass

class Sparse_format_error(Exception):
    '''Raised when reading a plain text file with .sparsetxt extension and there are not
    three entries per line.'''
    pass

# ==============================================================================================
# =============================DATA STRUCTURES==================================================
# ==============================================================================================
class DataSet(object):
    """
    General data structure for mini-batch gradient descent training involving non-sequential data.

    :param features: A dictionary of string feature names to data matrices. Matrices may be
        :any:`HotIndex`, scipy sparse csr_matrix, or numpy arrays.
    :param labels: A dictionary of string label names to data matrices. Matrices may be
        :any:`HotIndex`, scipy sparse csr_matrix, or numpy arrays.
    :param num_examples: How many data points.
    :param mix: Whether or not to shuffle per epoch.
    :return:
    """
    def __init__(self, features, labels=None, num_examples=None, mix=False):
        self._features = features  # hashmap of feature matrices
        if num_examples:
            self._num_examples = num_examples
        else:
            try:
                if labels:
                    self._num_examples = labels[labels.keys()[0]].shape[0]
                else:
                    self._num_examples = features[features.keys()[0]].shape[0]
            except IndexError:
                self._num_examples = -1
        if labels:
            self._labels = labels  # hashmap of label matrices
        else:
            self._labels = {}
        self._index_in_epoch = 0
        self._mix_after_epoch = mix

    def __repr__(self):
        attrs = vars(self)
        return 'antk.core.DataSet object with fields:\n' + '\n'.join(
            "\t%r: %r" % item for item in attrs.items())

    # ======================================================================================
    # =============================PROPERTIES===============================================
    # ======================================================================================
    @property
    def features(self):
        '''A dictionary of feature matrices.'''
        return self._features

    @property
    def index_in_epoch(self):
        '''The number of datapoints that have been trained on in a particular epoch.'''
        return self._index_in_epoch

    @property
    def labels(self):
        '''A dictionary of label matrices.'''
        return self._labels

    @property
    def num_examples(self):
        '''Number of rows (data points) of the matrices in this :any:`DataSet`.'''
        return self._num_examples

    # ======================================================================================
    # =============================PUBLIC METHODS===========================================
    # ======================================================================================
    def mix_after_epoch(self, mix):
        """
        Whether or not to shuffle after training for an epoch.

        :param mix: True or False
        """
        self._mix_after_epoch = mix
    def next_batch(self, batch_size):
        '''
        Return a sub DataSet of the next batch_size examples.

        If shuffling is turned off: when `batch_size` is greater than the number of examples
        left in the epoch, the returned batch wraps around to the beginning of the data.

        If shuffling is enabled: when `batch_size` is greater than the number of examples
        left in the epoch, the data points are shuffled and a batch_size DataSet is returned
        starting from index 0.

        :param batch_size: int
        :return: A :any:`DataSet` object with the next `batch_size` examples.
        '''
        assert batch_size <= self._num_examples
        start = self._index_in_epoch
        if self._index_in_epoch + batch_size > self._num_examples:
            if not self._mix_after_epoch:
                self._index_in_epoch = (self._index_in_epoch + batch_size) % self._num_examples
                # if self._index_in_epoch == 0:
                #     end = self._num_examples
                # else:
                #     end = self._index_in_epoch
                end = self._index_in_epoch
                newbatch = DataSet(self._next_batch_(self._features, start, end),
                                   self._next_batch_(self._labels, start, end),
                                   batch_size)
            else:
                perm = numpy.arange(self._num_examples)
                numpy.random.shuffle(perm)
                self._shuffle_(perm, self._features)
                self._shuffle_(perm, self._labels)
                start = 0
                end = batch_size
                newbatch = DataSet(self._next_batch_(self._features, start, end),
                                   self._next_batch_(self._labels, start, end),
                                   batch_size)
                self._index_in_epoch = batch_size
            return newbatch
        else:
            end = self._index_in_epoch + batch_size
            self._index_in_epoch = (batch_size + self._index_in_epoch) % self._num_examples
            if self._index_in_epoch == 0 and self._mix_after_epoch:
                perm = numpy.arange(self._num_examples)
                numpy.random.shuffle(perm)
                self._shuffle_(perm, self._features)
                self._shuffle_(perm, self._labels)
            return DataSet(self._next_batch_(self._features, start, end),
                           self._next_batch_(self._labels, start, end),
                           batch_size)
    def show(self):
        '''
        Pretty printing of all the data (dimensions, keys, type) in the :any:`DataSet` object.
        '''
        print('features:')
        for name, feature in self.features.iteritems():
            if type(feature) is HotIndex:
                print('\t %s: vec.shape: %s dim: %s %s' % (name, feature.vec.shape, feature.dim, type(feature)))
            else:
                print('\t %s: %s %s' % (name, feature.shape, type(feature)))
        print('labels:')
        for name, label in self.labels.iteritems():
            if type(label) is HotIndex:
                print('\t %s: vec.shape: %s dim: %s %s' % (name, label.vec.shape, label.dim, type(label)))
            else:
                print('\t %s: %s %s' % (name, label.shape, type(label)))
    def showmore(self):
        '''
        Print a sample of the first up to twenty rows of matrices in DataSet.
        '''
        print('features:')
        for name, feature in self.features.iteritems():
            print('\t %s: \nFirst twenty rows:\n%s\n' % (name, feature[0:min(20, feature.shape[0])]))
        print('labels:')
        for name, label in self.labels.iteritems():
            print('\t %s: \nFirst twenty rows:\n%s\n' % (name, label[0:min(20, label.shape[0])]))
    # ======================================================================================
    # =============================PRIVATE METHODS==========================================
    # ======================================================================================
    def _shuffle_(self, order, datamap):
        '''
        Shuffles the rows of each matrix in the :any:`DataSet` object.

        :param order: A list of the indices for the row permutation
        :param datamap: A hash map of matrices
        :return: void
        '''
        for matrix in datamap:
            if type(datamap[matrix]) is HotIndex:
                datamap[matrix] = HotIndex(datamap[matrix].vec[order], datamap[matrix].dim)
            else:
                datamap[matrix] = datamap[matrix][order]

    def _next_batch_(self, datamap, start, end=None):
        '''
        :param datamap: A hash map of matrices
        :param start: starting row
        :param end: ending row
        :return: A hash map of slices of matrices from row start to row end
        '''
        if end is None:
            end = self._num_examples
        batch_data_map = {}
        if end <= start:
            start2 = 0
            end2 = end
            end = self._num_examples
            wrapdata = {}
            for matrix in datamap:
                if type(datamap[matrix]) is HotIndex:
                    wrapdata[matrix] = datamap[matrix].vec[start2:end2]
                    batch_data_map[matrix] = datamap[matrix].vec[start:end]
                else:
                    wrapdata[matrix] = datamap[matrix][start2:end2]
                    batch_data_map[matrix] = datamap[matrix][start:end]
                if sps.issparse(batch_data_map[matrix]):
                    batch_data_map[matrix] = sps.vstack([batch_data_map[matrix], wrapdata[matrix]])
                else:
                    batch_data_map[matrix] = numpy.concatenate([batch_data_map[matrix], wrapdata[matrix]], axis=0)
        else:
            for matrix in datamap:
                if type(datamap[matrix]) is HotIndex:
                    batch_data_map[matrix] = datamap[matrix].vec[start:end]
                else:
                    batch_data_map[matrix] = datamap[matrix][start:end]
        return batch_data_map
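A minimal usage sketch of the class above (not part of the module source; it assumes numpy is installed and that this module is importable as antk.core.loader, the path used in the docstring examples further down; the dictionary keys 'images' and 'targets' are hypothetical names): build a DataSet from small in-memory matrices and draw two mini-batches.

import numpy
from antk.core import loader

features = {'images': numpy.random.rand(10, 4)}
labels = {'targets': numpy.random.rand(10, 1)}
data = loader.DataSet(features, labels, mix=True)   # shuffle between epochs

batch = data.next_batch(4)    # a DataSet holding rows 0-3
print(batch.num_examples)     # 4
batch = data.next_batch(4)    # rows 4-7; the data is reshuffled once the epoch wraps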
class DataSets(object):
    '''
    A record of DataSet objects with a display function.
    '''
    def __init__(self, datasets_map, mix=False):
        for k, v in datasets_map.items():
            setattr(self, k, DataSet(v['features'], v['labels'], v['num_examples'], mix=mix))

    def __repr__(self):
        attrs = vars(self)
        return 'antk.core.DataSets object with fields:\n' + '\n'.join(
            "\t%s: %s" % item for item in attrs.items())

    def show(self):
        """
        Pretty print data attributes.
        """
        datasets = [s for s in dir(self) if not s.startswith('__') and not s == 'show' and not s == 'showmore']
        for dataset in datasets:
            print(tc.colored(dataset + ':', 'yellow'))
            getattr(self, dataset).show()

    def showmore(self):
        """
        Pretty print data attributes, and data.
        """
        datasets = [s for s in dir(self) if not s.startswith('__') and not s == 'show' and not s == 'showmore']
        for dataset in datasets:
            print(tc.colored(dataset + ':', 'yellow'))
            getattr(self, dataset).showmore()
class HotIndex(object):
    """
    Index vector representation of a one hot matrix.
    The constructor accepts either a one hot matrix, or a vector of indices and a dimension.
    """
    def __init__(self, matrix, dimension=None):
        if is_one_hot(matrix):
            self._dim = matrix.shape[1]
            self._vec = toIndex(matrix).flatten()
        else:
            self._dim = dimension
            self._vec = numpy.array(matrix).flatten()

    def __repr__(self):
        vector = repr.repr(self._vec.tolist())
        return '%s(shape=%s)\nvec=%s' % (type(self), self.shape, vector)

    def __str__(self):
        return '%s(shape=%s)\n%s' % (type(self), self.shape, self._vec)

    def __len__(self):
        return self._vec.shape[0]

    def __getitem__(self, index):
        cls = type(self)
        if isinstance(index, numbers.Integral):
            return cls(self._vec[index], self.dim)
        elif isinstance(index, HotIndex):
            return cls(self._vec[index._vec], self.dim)
        else:
            try:
                return cls(self._vec[index], self._dim)
            except IndexError:
                return int(self._vec[index[0]] == index[1])

    @property
    def dim(self):
        '''The feature dimension of the one hot vectors represented as indices.'''
        return self._dim

    @property
    def vec(self):
        '''The vector of hot indices.'''
        return self._vec

    @property
    def shape(self):
        '''The shape of the one hot matrix encoded.'''
        return (self.vec.shape[0], self.dim)
    def hot(self):
        """
        :return: A one hot scipy sparse csr_matrix
        """
        return toOnehot(self)

class IndexVector(HotIndex):
    pass
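As a quick illustration of the class above (a sketch, not module source; numpy and the antk.core.loader import path are assumed), a HotIndex can be built either from a one hot matrix or from an index vector plus a dimension:

import numpy
from antk.core import loader

h1 = loader.HotIndex(numpy.eye(4))                              # dim inferred from the one hot matrix
h2 = loader.HotIndex(numpy.array([0, 1, 2, 3]), dimension=4)    # from indices and a dimension

print(h1.vec)      # [0 1 2 3]
print(h2.dim)      # 4
print(h2.shape)    # (4, 4)
onehot = h2.hot()  # back to a sparse one hot csr_matrix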
# ===================================================================================
# ====================== I/O ========================================================
# ===================================================================================
def load(filename):
    '''
    Calls :any:`import_data`.
    Decides how to load data into python matrices by file extension.
    Raises :any:`Unsupported_format_error` if extension is not one of the supported
    extensions (mat, sparse, binary, dense, sparsetxt, densetxt, index).

    :param filename: A file of an accepted format representing a matrix.
    :return: A numpy matrix, scipy sparse csr_matrix, or :any:`HotIndex`.
    '''
    return import_data(filename)
def import_data(filename):
    '''
    Decides how to load data into python matrices by file extension.
    Raises :any:`Unsupported_format_error` if extension is not one of the supported
    extensions (mat, sparse, binary, dense, sparsetxt, densetxt, index).

    :param filename: A file of an accepted format representing a matrix.
    :return: A numpy matrix, scipy sparse csr_matrix, or :any:`HotIndex`.
    '''
    extension = filename.split(slash)[-1].split('.')[-1].strip()
    if extension == 'mat':
        mat_file_map = scipy.io.loadmat(filename)
        if 'data' not in mat_file_map:
            raise Mat_format_error('Matrix in .mat file ' + filename + ' must be named "data"')
        if mat_file_map['data'].shape[0] == 1:
            return numpy.transpose(mat_file_map['data'])
        else:
            return mat_file_map['data']
    elif extension == 'index':
        return _imatload(filename)
    elif extension == 'sparse':
        return _smatload(filename)
    elif extension == 'binary' or extension == 'dense':
        return _matload(filename)
    elif extension == 'sparsetxt':
        X = numpy.loadtxt(filename)
        if X.shape[1] != 3:
            raise Sparse_format_error('Sparse Format: row col val')
        # shift one-based row and column indices to zero-based
        if numpy.amin(X[:, 0]) == 1 or numpy.amin(X[:, 1]) == 1:
            X[:, 0] = X[:, 0] - 1
            X[:, 1] = X[:, 1] - 1
        return sps.csr_matrix((X[:, 2], (X[:, 0], X[:, 1])))
    elif extension == 'densetxt':
        return numpy.loadtxt(filename)
    else:
        raise Unsupported_format_error('Supported extensions: '
                                       'mat, sparse, binary, sparsetxt, densetxt, index')
def save(filename, data):
    '''
    Calls :any:`export_data`.
    Decides how to save data by file extension.
    Raises :any:`Unsupported_format_error` if extension is not one of the supported
    extensions (mat, sparse, binary, dense, index).
    Data contained in .mat files should be saved in a matrix named *data*.

    :param filename: A file of an accepted format representing a matrix.
    :param data: A numpy array, scipy sparse matrix, or :any:`HotIndex` object.
    '''
    export_data(filename, data)
def export_data(filename, data):
    '''
    Decides how to save data by file extension.
    Raises :any:`Unsupported_format_error` if extension is not one of the supported
    extensions (mat, sparse, binary, dense, index).
    Data contained in .mat files should be saved in a matrix named *data*.

    :param filename: A file of an accepted format representing a matrix.
    :param data: A numpy array, scipy sparse matrix, or :any:`HotIndex` object.
    '''
    extension = filename.split(slash)[-1].split('.')[-1].strip()
    if extension == 'mat':
        scipy.io.savemat(filename, {'data': data})
    elif extension == 'index':
        if not isinstance(data, HotIndex):
            raise Unsupported_format_error('Only HotIndex objects may be saved in .index format.')
        _imatsave(filename, data)
    elif extension == 'sparse':
        if not sps.issparse(data):
            raise Unsupported_format_error('Only scipy sparse matrices may be saved in .sparse format.')
        _smatsave(filename, data)
    elif extension == 'binary' or extension == 'dense':
        if sps.issparse(data):
            raise Unsupported_format_error('Only numpy 2d arrays may be saved in .binary or .dense format.')
        _matsave(filename, data)
    elif extension == 'densetxt':
        if sps.issparse(data):
            raise Unsupported_format_error('Only numpy 2d arrays may be saved in .densetxt format.')
        numpy.savetxt(filename, data)
    elif extension == 'sparsetxt':
        if not sps.issparse(data):
            raise Unsupported_format_error('Only scipy sparse matrices may be saved in .sparsetxt format.')
        scipy.io.mmwrite(filename, data)
        with open(filename + '.mtx', 'r') as f:
            lines = f.read().strip().split('\n')
        os.system('rm ' + filename + '.mtx')
        matrixstr = '\n'.join(lines[2:-1])
        with open(filename, 'w') as f:
            f.write(matrixstr)
    else:
        raise Unsupported_format_error('Supported extensions: '
                                       'mat, sparse, binary, dense, index, sparsetxt, densetxt')
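A round-trip sketch of the extension-driven dispatch above (illustrative only; the file names are hypothetical and the example assumes write access to the current directory):

import numpy
import scipy.sparse as sps
from antk.core import loader

x = numpy.random.rand(5, 3)
loader.save('x.dense', x)                    # binary dense format (_matsave)
dense_copy = loader.load('x.dense')

loader.save('x.mat', x)                      # MATLAB file with the variable named 'data'
mat_copy = loader.load('x.mat')

loader.save('x.sparse', sps.csr_matrix(x))   # binary sparse triples (_smatsave)
sparse_copy = loader.load('x.sparse')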
def _write_int64(file_obj, num):
    """
    Writes an 8 byte integer to a file in binary format. From David Palzer.

    :param file_obj: the open file object to write to
    :param num: the integer to write, will be converted to a long int
    """
    file_obj.write(struct.pack('q', long(num)))

def _read_int64(file_obj):
    """
    Reads an 8 byte binary integer from a file. From David Palzer.

    :param file_obj: The open file object from which to read.
    :return: The eight bytes read from the file object interpreted as a long int.
    """
    return struct.unpack('q', file_obj.read(8))[0]

def _matload(filename):
    """
    Reads in a dense matrix from binary (dense) file filename. From `David Palzer`_.

    :param filename: file from which to read.
    :return: the matrix which has been read.
    """
    f = open(filename, 'rb')
    m = _read_int64(f)
    n = _read_int64(f)
    x = numpy.fromfile(f, numpy.dtype(numpy.float64), -1, "")
    x = x.reshape((m, n), order='F')
    f.close()
    return numpy.array(x)

def _matsave(filename, x):
    """
    Saves the input matrix to the input file in dense format. From `David Palzer`_.

    :param filename: file to write to
    :param x: matrix to write
    """
    if len(x.shape) == 1:
        x = numpy.reshape(x, (x.shape[0], 1))
    f = open(filename, 'wb')
    _write_int64(f, x.shape[0])
    _write_int64(f, x.shape[1])
    x.astype(numpy.float64, copy=False).ravel('F').tofile(f)
    f.close()

def _smatload(filename):
    """
    Reads in a sparse matrix from file. From `David Palzer`_.

    :param filename: the file from which to read
    :return: a sparse matrix created from the sparse data
    """
    f = open(filename, 'rb')
    row = _read_int64(f)
    col = _read_int64(f)
    nnz = _read_int64(f)
    S = numpy.fromfile(f, 'd', 3 * nnz)
    f.close()
    S = S.reshape((nnz, 3))
    rows = S[:, 0].astype(int) - 1
    cols = S[:, 1].astype(int) - 1
    vals = S[:, 2]
    return sps.csr_matrix((vals, (rows, cols)), shape=(row, col))

def _smatsave(filename, t):
    """
    Saves the input matrix to the input file in sparse format. From `David Palzer`_.

    :param filename: file to write to
    :param t: sparse matrix to write
    """
    t = sps.csr_matrix(t, copy=False)
    f = open(filename, 'wb')
    _write_int64(f, t.shape[0])
    _write_int64(f, t.shape[1])
    indices = t.nonzero()
    idxs = numpy.vstack((indices[0], indices[1]))
    _write_int64(f, len(indices[1]))
    ti = numpy.mat(t[indices])
    indices = numpy.concatenate((idxs, ti))
    indices[0:2, :] += 1
    indices.astype(float, copy=False).ravel('F').tofile(f)
    f.close()

def _imatload(filename):
    """
    Reads in a :any:`HotIndex` matrix from file.

    :param filename: the file from which to read where a :any:`HotIndex` object was stored.
    :return: A :any:`HotIndex` object.
    """
    f = open(filename, 'rb')
    vec_length = _read_int64(f)
    dim = _read_int64(f)
    vec = numpy.fromfile(f, 'd', vec_length)
    f.close()
    vec = vec.astype(int) - 1
    return HotIndex(vec, dim)

def _imatsave(filename, index_vec):
    """
    Saves the input matrix to the input file in sparse format.

    :param filename: Filename to save to.
    :param index_vec: A :any:`HotIndex` object.
    """
    f = open(filename, 'wb')
    vector = index_vec.vec
    vector = vector + 1
    _write_int64(f, vector.shape[0])
    _write_int64(f, index_vec.dim)
    vector.astype(float, copy=False).tofile(f)
    f.close()
def makedirs(datadirectory, sub_directory_list=('train', 'dev', 'test')):
    '''
    :param datadirectory: Name of the directory you want to create containing the
        subdirectory folders. If the directory already exists it will be populated
        with the subdirectory folders.
    :param sub_directory_list: The list of subdirectories you want to create
    :return: void
    '''
    if not datadirectory.endswith(slash):
        datadirectory += slash
    os.system('mkdir ' + datadirectory)
    for sub in sub_directory_list:
        os.system('mkdir ' + datadirectory + sub)
def read_data_sets(directory, folders=('train', 'dev', 'test'), hashlist=(), mix=False):
    '''
    :param directory: Root directory containing data to load.
    :param folders: The subfolders of *directory* to read data from. By default there are
        train, dev, and test folders. If you want others you have to make an explicit list.
    :param hashlist: If you provide a hashlist these files and only these files will be
        added to your :any:`DataSet` objects. If you do not provide a hashlist then
        anything with the privileged prefixes labels_ or features_ will be loaded.
    :param mix: Whether or not to shuffle per epoch.
    :return: A :any:`DataSets` object.
    '''
    if not directory.endswith(slash):
        directory += slash
    dir_files = os.listdir(directory)
    datasets_map = {}
    for folder in folders:  # iterates over keys
        dataset_map = {'features': {}, 'labels': {}, 'num_examples': 0}
        print('reading ' + folder + '...')
        if folder not in dir_files:
            raise Bad_directory_structure_error('Need ' + folder + ' folder in ' + directory + ' directory.')
        file_list = os.listdir(directory + folder)
        for filename in file_list:
            prefix = filename.split('_')[0]
            if prefix == 'features' or prefix == 'labels':
                prefix_ = prefix + '_'
                descriptor = (filename.split('.')[0]).split(prefix_)[-1]
                if (not hashlist) or (descriptor in hashlist):
                    dataset_map[prefix][descriptor] = import_data(directory + folder + slash + filename)
                    if prefix == 'labels':
                        dataset_map['num_examples'] = dataset_map[prefix][descriptor].shape[0]
        datasets_map[folder] = dataset_map
    return DataSets(datasets_map, mix=mix)
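The directory layout read_data_sets expects can be produced with makedirs and save. The sketch below is illustrative only (the 'mydata' directory and the x/y descriptors are hypothetical names, and the forward-slash paths assume a Unix-like system):

import numpy
from antk.core import loader

loader.makedirs('mydata')                    # creates mydata/ with train/, dev/, test/
for sub in ('train', 'dev', 'test'):
    loader.save('mydata/%s/features_x.dense' % sub, numpy.random.rand(6, 2))
    loader.save('mydata/%s/labels_y.dense' % sub, numpy.random.rand(6, 1))

data = loader.read_data_sets('mydata')       # a DataSets record with train, dev, test
data.show()                                  # per-split shapes and types
print(data.train.features['x'].shape)        # (6, 2)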
# ===================================================================================
# ====================== DATA MANIPULATION==========================================
# ===================================================================================
def toOnehot(X, dim=None):
    '''
    :param X: Vector of indices or :any:`HotIndex` object
    :param dim: Dimension of indexing
    :return: A sparse csr_matrix of one hots.

    Examples
    --------
    >>> import numpy
    >>> from antk.core import loader
    >>> x = numpy.array([0, 1, 2, 3])
    >>> loader.toOnehot(x) #doctest: +ELLIPSIS
    <4x4 sparse matrix of type '<type 'numpy.float64'>'...
    >>> loader.toOnehot(x).toarray()
    array([[ 1.,  0.,  0.,  0.],
           [ 0.,  1.,  0.,  0.],
           [ 0.,  0.,  1.,  0.],
           [ 0.,  0.,  0.,  1.]])
    >>> x = loader.HotIndex(x, dimension=8)
    >>> loader.toOnehot(x).toarray()
    array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
           [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
           [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
           [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.]])
    '''
    if isinstance(X, HotIndex):
        dim = X.dim
        X = X.vec
    else:
        if dim is None:
            dim = numpy.amax(X) + 1
    return sps.csr_matrix(([1.0]*X.shape[0], (range(X.shape[0]), X.astype(int))),
                          shape=(X.shape[0], dim))
def is_one_hot(A):
    '''
    :param A: A numpy array or scipy sparse matrix
    :return: True if matrix is a sparse matrix of one hot vectors, False otherwise

    Examples
    --------
    >>> import numpy
    >>> from antk.core import loader
    >>> x = numpy.eye(3)
    >>> loader.is_one_hot(x)
    True
    >>> x *= 5
    >>> loader.is_one_hot(x)
    False
    >>> x = numpy.array([[1, 0, 0], [1, 0, 0], [1, 0, 0]])
    >>> loader.is_one_hot(x)
    True
    >>> x[0,1] = 2
    >>> loader.is_one_hot(x)
    False
    '''
    A = sps.csr_matrix(A)
    (i, j, v) = sps.find(A)
    return (numpy.sum(v) == A.shape[0] and
            numpy.unique(i).shape[0] == A.shape[0] and
            numpy.unique(v).shape[0] == 1 and
            numpy.unique(v)[0] == 1)
def toIndex(A):
    '''
    :param A: A matrix of one hot row vectors.
    :return: The hot indices.

    Examples
    --------
    >>> import numpy
    >>> from antk.core import loader
    >>> x = numpy.array([[1,0,0], [0,0,1], [1,0,0]])
    >>> loader.toIndex(x)
    array([0, 2, 0])
    '''
    if is_one_hot(A):
        if sps.issparse(A):
            return sps.find(A)[1]
        else:
            return numpy.nonzero(A)[1]
    else:
        raise ValueError('Argument to function must be a one hot matrix.')
def center(X, axis=None):
    """
    :param X: A matrix to center about the mean (over columns axis=0, over rows axis=1,
        over all entries axis=None)
    :return: A matrix with entries centered along the specified axis.
    """
    if sps.issparse(X):
        X = X.todense()
        return sps.csr_matrix(X - numpy.mean(X, axis=axis))
    else:
        return X - numpy.mean(X, axis=axis)
def unit_variance(X, axis=None):
    """
    :param X: A matrix to transform to have unit variance (over columns axis=0, over rows
        axis=1, over all entries axis=None)
    :return: A matrix with unit variance along the specified axis.
    """
    if sps.issparse(X):
        X = X.todense()
        return sps.csr_matrix(X / numpy.std(X, axis=axis))
    else:
        return X / numpy.std(X, axis=axis)
def pca_whiten(X):
    """
    Returns matrix with PCA whitening transform applied.
    This transform assumes that data points are rows of matrix.

    :param X: Numpy array, scipy sparse matrix
    :return: A whitened matrix.
    """
    if sps.issparse(X):
        return sps.csr_matrix(pca_whiten(X.todense()))
    else:
        X -= numpy.mean(X, axis=0)
        cov = numpy.dot(X.T, X)/X.shape[0]
        U, S, V = numpy.linalg.svd(cov)
        Xrot = numpy.dot(X, U)
        return Xrot/numpy.sqrt(S + 1e-5)
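The three preprocessing helpers above compose naturally. A small sketch (numpy assumed; X is an arbitrary stand-in data matrix with points as rows):

import numpy
from antk.core import loader

X = numpy.random.rand(100, 5)
X = loader.center(X, axis=0)           # zero mean in each column
X = loader.unit_variance(X, axis=0)    # unit variance in each column
W = loader.pca_whiten(X)               # rotate onto principal axes and rescale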
# ===================================================
# Normalizations for tfidf or whatever
# ====================================================
def l2normalize(X, axis=1):
    """
    axis=1 normalizes each row of X by the norm of said row.
    :math:`l2normalize(X)_{ij} = \\frac{X_{ij}}{\sqrt{\sum_k X_{ik}^2}}`

    axis=0 normalizes each column of X by the norm of said column.
    :math:`l2normalize(X)_{ij} = \\frac{X_{ij}}{\sqrt{\sum_k X_{kj}^2}}`

    axis=None normalizes entries of X by the norm of X.
    :math:`l2normalize(X)_{ij} = \\frac{X_{ij}}{\sqrt{\sum_k \sum_p X_{kp}^2}}`

    :param X: A scipy sparse csr_matrix or numpy array.
    :param axis: The dimension to normalize over.
    :return: A normalized matrix.
    """
    issparse = sps.issparse(X)
    if issparse:
        X = X.toarray()
    normalized_matrix = X/numpy.linalg.norm(X, ord=2, axis=axis, keepdims=True)
    if issparse:
        normalized_matrix = sps.csr_matrix(normalized_matrix)
    return normalized_matrix
def l1normalize(X, axis=1):
    """
    axis=1 normalizes each row of X by the norm of said row.
    :math:`l1normalize(X)_{ij} = \\frac{X_{ij}}{\sum_k |X_{ik}|}`

    axis=0 normalizes each column of X by the norm of said column.
    :math:`l1normalize(X)_{ij} = \\frac{X_{ij}}{\sum_k |X_{kj}|}`

    axis=None normalizes entries of X by the norm of X.
    :math:`l1normalize(X)_{ij} = \\frac{X_{ij}}{\sum_k \sum_p |X_{kp}|}`

    :param X: A scipy sparse csr_matrix or numpy array.
    :param axis: The dimension to normalize over.
    :return: A normalized matrix.
    """
    issparse = sps.issparse(X)
    if issparse:
        X = X.toarray()
    normalized_matrix = X/numpy.linalg.norm(X, ord=1, axis=axis, keepdims=True)
    if issparse:
        normalized_matrix = sps.csr_matrix(normalized_matrix)
    return normalized_matrix
def maxnormalize(X, axis=1):
    """
    axis=1 normalizes each row of X by the norm of said row.
    :math:`maxnormalize(X)_{ij} = \\frac{X_{ij}}{max(X_{i:})}`

    axis=0 normalizes each column of X by the norm of said column.
    :math:`maxnormalize(X)_{ij} = \\frac{X_{ij}}{max(X_{:j})}`

    axis=None normalizes entries of X by the norm of X.
    :math:`maxnormalize(X)_{ij} = \\frac{X_{ij}}{max(X)}`

    :param X: A scipy sparse csr_matrix or numpy array.
    :param axis: The dimension to normalize over.
    :return: A normalized matrix.
    """
    issparse = sps.issparse(X)
    if issparse:
        X = X.toarray()
    normalized_matrix = X/numpy.linalg.norm(X, ord=numpy.inf, axis=axis, keepdims=True)
    if issparse:
        normalized_matrix = sps.csr_matrix(normalized_matrix)
    return normalized_matrix
NORM = {'l2': l2normalize, 'count': l1normalize, 'max': maxnormalize}
def tfidf(X, norm='l2'):
    """
    :param X: A document-term matrix.
    :param norm: Normalization strategy:
        'l2': normalizes the scores of rows by length of rows after basic tfidf
        (each document vector is a unit vector),
        'count': normalizes the scores of rows by the total word count of a document,
        'max': normalizes the scores of rows by the maximum count for a single word
        in a document.
    :return: Returns tfidf of document-term matrix X with optional normalization.
    """
    idf = numpy.log(X.shape[0]/X.sign().sum(0))
    # make a diagonal matrix of idf values to matrix multiply with tf.
    IDF = sps.csr_matrix((idf.tolist()[0], (range(X.shape[1]), range(X.shape[1]))))
    if norm == 'count' or norm == 'max':
        # Only normalizing tf
        return sps.csr_matrix(NORM[norm](X)*IDF)
    elif norm == 'l2':
        # normalizing tfidf
        return NORM[norm](X*IDF)
    else:
        # no normalization
        return X*IDF
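A small sketch of tfidf on a toy document-term count matrix (two documents, three terms; scipy and the antk.core.loader import path are assumed, and the counts are made-up values):

import scipy.sparse as sps
from antk.core import loader

counts = sps.csr_matrix([[2, 0, 1],
                         [0, 3, 1]])          # rows are documents, columns are terms
weights = loader.tfidf(counts, norm='l2')     # idf-weighted, rows scaled to unit length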
# ========================================================
# ==================MISC==================================
# ========================================================
def untar(fname):
    if fname.endswith("tar.gz"):
        tar = tarfile.open(fname)
        tar.extractall()
        tar.close()
        print("Extracted " + fname + " in Current Directory")
    else:
        print("Not a tar.gz file: '%s'" % fname)
def maybe_download(filename, work_directory, source_url):
    """
    Download the data from source url, unless it's already here. From
    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/learn/python/learn/datasets/base.py

    :param filename: string, name of the file in the directory.
    :param work_directory: string, path to working directory.
    :param source_url: url to download from if file doesn't exist.
    :return: Path to resulting file.
    """
    filepath = os.path.join(work_directory, filename)
    if not os.path.isfile(filepath):
        urlopen = urllib.URLopener()
        urlopen.retrieve(source_url, filepath)
    return filepath
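A sketch of fetching and unpacking an archive with the two helpers above (the URL and archive name are hypothetical placeholders, not real resources):

from antk.core import loader

path = loader.maybe_download('somedata.tar.gz', '.',
                             'http://example.com/somedata.tar.gz')  # skipped if already present
loader.untar(path)   # extracts into the current directory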