From 95ef9371057bca80062a00a1fbde7f515856b92c Mon Sep 17 00:00:00 2001 From: shawk masboob <masboob.shawk@gmail.com> Date: Fri, 24 Apr 2020 02:34:06 -0400 Subject: [PATCH] linted function file --- Topological_ML/tda_function.py | 110 ++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 36 deletions(-) diff --git a/Topological_ML/tda_function.py b/Topological_ML/tda_function.py index a2e986d..43db154 100644 --- a/Topological_ML/tda_function.py +++ b/Topological_ML/tda_function.py @@ -1,54 +1,92 @@ """ This file contains all of the functions used within the notebooks. -Date: -Author: +Date: April 24, 2020 +Author: Shawk Masboob + +The function `uniform_sampling` was borrowed from Luis Polancocontreras, +a PhD candidate in the CMSE program at Michigan State University. +It was slightly tweaked to fit this project. """ -import sklearn -from sklearn.linear_model import LinearRegression +from sklearn import ensemble import kmapper as km -import pandas as pd import numpy as np +import pandas as pd def numpy_to_pandas(sklearn_data): - """ - Converts scikit-learn numpy data into pandas dataframe. - Input: name of dataframe - Output: pandas dataframe + """Converts scikit-learn numpy data into pandas dataframe + + Args: + sklearn_data (object): scikit-learn dataset with `data`, `feature_names` and `target` + + Returns: + pandas.DataFrame: the features plus a `target` column + """ data = pd.DataFrame(data=sklearn_data.data, columns=sklearn_data.feature_names) data['target'] = pd.Series(sklearn_data.target) return data -def linear_regression(feature, predictor): - """ - Ordinary least squares Linear Regression. - input: x = independent variables - y = dependent variable - output: R^2 - """ - model = LinearRegression() - model.fit(feature, predictor) - return model.score(feature, predictor) +def lens_1d(x_array, proj='l2norm', random_num=1729, verbosity=0): + """Creates a L^2-Norm for features. This lens highlights expected features in the data.
+ + Args: + x_array (array): features of dataset + proj (string): projection type + random_num: random state + verbosity: verbosity + + Returns: + lens: Isolation Forest, L^2-Norm + mapper: KeplerMapper object used for the projection -def lens_1d(features, random_num, verbosity): - """ - input: - output: """ - model = sklearn.ensemble.IsolationForest(random_state=random_num) - model.fit(features) - lens1 = model.decision_function(features).reshape((features.shape[0], 1)) + if not type(x_array) == np.ndarray: + print("your input is not an array") + return None, None + if type(x_array) == np.ndarray and len(x_array.shape) != 2: + print('your input needs to be a 2d array') + return None, None + proj_type = ['sum', 'mean', 'median', 'max', 'min', 'std', 'dist_mean', + 'l2norm', 'knn_distance_n'] + if proj not in proj_type: + print("you may only use the following projections:", proj_type) + return None, None + # Create a custom 1-D lens with Isolation Forest + model = ensemble.IsolationForest(random_state=random_num) + model.fit(x_array) + lens1 = model.decision_function(x_array).reshape((x_array.shape[0], 1)) + # Create another 1-D lens with L2-norm mapper = km.KeplerMapper(verbose=verbosity) - lens2 = mapper.fit_transform(features, projection="l2norm") + lens2 = mapper.fit_transform(x_array, projection=proj) + # Combine lenses pairwise to get a 2-D lens i.e. [Isolation Forest, L^2-Norm] lens lens = np.c_[lens1, lens2] - return lens + return lens, mapper + +def uniform_sampling(dist_matrix, n_sample): + """Given a distance matrix returns a subsampling that preserves the distribution + of the original data set and the covering radius corresponding to + the subsampled set. + + Args: + dist_matrix (array): Distance matrix + n_sample (int): Size of subsample set + + Returns: + list_subsample (array): List of indices corresponding to the subsample set. + np.max(dist_to_l): Covering radius for the subsample set.
-def county_crosstab(data, county, year, index, columns): - """ - input: - output: """ - subset_df = data[data.year == year] - sub_df = subset_df[subset_df.county == county] - crosstab = pd.crosstab(index=sub_df[index], columns=sub_df[columns]) - return crosstab + if not type(dist_matrix) == np.ndarray: + print("your input is not an array") + return None, None + if type(dist_matrix) == np.ndarray and len(dist_matrix.shape) != 2: + print('your input needs to be a 2d array') + return None, None + n_subsample = int(n_sample) + if n_subsample <= 0: + print("Sampling size should be a positive integer.") + return None, None + num_points = dist_matrix.shape[0] + list_subsample = np.random.choice(num_points, n_subsample) + dist_to_l = np.min(dist_matrix[list_subsample, :], axis=0) + return list_subsample, np.max(dist_to_l) -- GitLab