Commit 95ef9371 authored by shawk masboob

linted function file

parent 779f3c6f
"""
This file contains all of the functions used within the notebooks.
Date:
Author:
Date: April 24, 2020
Author: Shawk Masboob
The function `uniform_sampling` was borrowed from Luis Polancocontreras,
a PhD candidate in the CMSE program at Michigan State University.
It was slightly tweaked to fit this project.
"""
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
import kmapper as km
import numpy as np
import pandas as pd


def numpy_to_pandas(sklearn_data):
"""
Converts scikit-learn numpy data into pandas dataframe.
Input: name of dataframe
Output: pandas dataframe
"""Converts scikit-learn numpy data into pandas dataframe
Args:
sklearn_data (array): name of dataframe
Returns:
panda.array: pandas dataframe
"""
data = pd.DataFrame(data=sklearn_data.data, columns=sklearn_data.feature_names)
data['target'] = pd.Series(sklearn_data.target)
return data
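

# Example usage sketch for numpy_to_pandas, assuming the scikit-learn diabetes
# bunch as input; any loader exposing `data`, `target`, and `feature_names`
# should work the same way. The dataset choice here is only an assumption.
def _example_numpy_to_pandas():
    from sklearn.datasets import load_diabetes  # assumed example dataset
    diabetes_df = numpy_to_pandas(load_diabetes())
    return diabetes_df.head()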
def linear_regression(feature, predictor):
"""
Ordinary least squares Linear Regression.
input: x = independent variables
y = dependent variable
output: R^2
"""
model = LinearRegression()
model.fit(feature, predictor)
return model.score(feature, predictor)
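

# Example usage sketch for linear_regression, again assuming the scikit-learn
# diabetes dataset; it reports the in-sample R^2 of the fitted OLS model.
def _example_linear_regression():
    from sklearn.datasets import load_diabetes  # assumed example dataset
    x_features, y_target = load_diabetes(return_X_y=True)
    return linear_regression(x_features, y_target)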
def lens_1d(x_array, proj='l2norm', random_num=1729, verbosity=0):
    """Creates a 2-D lens for the Mapper algorithm.

    Combines a 1-D Isolation Forest lens with a 1-D projection lens
    (L^2-norm by default). This lens highlights expected features in the data.

    Args:
        x_array (numpy.ndarray): features of the dataset (2-D array)
        proj (str): projection type used by KeplerMapper
        random_num (int): random state for the Isolation Forest
        verbosity (int): verbosity level for KeplerMapper

    Returns:
        lens (numpy.ndarray): combined [Isolation Forest, projection] lens
        mapper (kmapper.KeplerMapper): fitted KeplerMapper instance
    """
    if not isinstance(x_array, np.ndarray):
        print("your input is not an array")
        return None, None
    if len(x_array.shape) != 2:
        print('your input needs to be a 2d array')
        return None, None
    proj_type = ['sum', 'mean', 'median', 'max', 'min', 'std', 'dist_mean',
                 'l2norm', 'knn_distance_n']
    if proj not in proj_type:
        print("you may only use the following projections:", proj_type)
        return None, None
    # Create a custom 1-D lens with Isolation Forest
    model = ensemble.IsolationForest(random_state=random_num)
    model.fit(x_array)
    lens1 = model.decision_function(x_array).reshape((x_array.shape[0], 1))
    # Create another 1-D lens with the chosen projection (L^2-norm by default)
    mapper = km.KeplerMapper(verbose=verbosity)
    lens2 = mapper.fit_transform(x_array, projection=proj)
    # Combine lenses pairwise to get a 2-D lens, i.e. [Isolation Forest, L^2-Norm]
    lens = np.c_[lens1, lens2]
    return lens, mapper
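

# Example usage sketch for lens_1d on a small random 2-D array; the sample size,
# dimensionality, and seed below are arbitrary assumptions for illustration.
def _example_lens_1d():
    rng = np.random.default_rng(1729)
    points = rng.normal(size=(100, 4))
    lens, mapper = lens_1d(points, proj='l2norm')
    return lens.shape, mapper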
def uniform_sampling(dist_matrix, n_sample):
    """Subsamples a data set from its distance matrix.

    Given a distance matrix, returns a subsample that preserves the
    distribution of the original data set, together with the covering
    radius corresponding to the subsampled set.

    Args:
        dist_matrix (numpy.ndarray): distance matrix (2-D array)
        n_sample (int): size of the subsample set

    Returns:
        list_subsample (numpy.ndarray): indices corresponding to the subsample set
        np.max(dist_to_l) (float): covering radius for the subsample set
    """
    if not isinstance(dist_matrix, np.ndarray):
        print("your input is not an array")
        return None, None
    if len(dist_matrix.shape) != 2:
        print('your input needs to be a 2d array')
        return None, None
    n_subsample = int(n_sample)
    if n_subsample <= 0:
        print("Sampling size should be a positive integer.")
        return None, None
    num_points = dist_matrix.shape[0]
    list_subsample = np.random.choice(num_points, n_subsample)
    dist_to_l = np.min(dist_matrix[list_subsample, :], axis=0)
    return list_subsample, np.max(dist_to_l)


def county_crosstab(data, county, year, index, columns):
    """Builds a cross-tabulation for a single county and year.

    Args:
        data (pandas.DataFrame): dataframe containing `county` and `year` columns
        county: county to subset on
        year: year to subset on
        index (str): column used as the crosstab rows
        columns (str): column used as the crosstab columns

    Returns:
        pandas.DataFrame: frequency table of `index` by `columns`
    """
    subset_df = data[data.year == year]
    sub_df = subset_df[subset_df.county == county]
    crosstab = pd.crosstab(index=sub_df[index], columns=sub_df[columns])
    return crosstab
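

# Example usage sketches for uniform_sampling and county_crosstab; the random
# points, the SciPy distance helper, and the toy dataframe with its column
# names and county labels are assumptions made purely for illustration.
def _example_uniform_sampling():
    from scipy.spatial.distance import cdist  # assumed available via SciPy
    rng = np.random.default_rng(0)
    points = rng.normal(size=(50, 3))
    distances = cdist(points, points)  # pairwise Euclidean distance matrix
    return uniform_sampling(distances, 10)


def _example_county_crosstab():
    toy = pd.DataFrame({'year': [2015, 2015, 2016],
                        'county': ['Ingham', 'Ingham', 'Wayne'],
                        'status': ['open', 'closed', 'open'],
                        'type': ['a', 'b', 'a']})
    return county_crosstab(toy, 'Ingham', 2015, 'status', 'type')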