Commit 95ef9371 authored by shawk masboob

linted function file

parent 779f3c6f
"""
This file contains all of the functions used within the notebooks.
Date:
Author:
Date: April 24, 2020
Author: Shawk Masboob
The function `uniform_sampling` was borrowed from Luis Polancocontreras,
a PhD candidate in the CMSE program at Michigan State University.
It was slightly tweaked to fit this project.
"""
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
import kmapper as km
import numpy as np
import pandas as pd


def numpy_to_pandas(sklearn_data):
"""
Converts scikit-learn numpy data into pandas dataframe.
Input: name of dataframe
Output: pandas dataframe
"""Converts scikit-learn numpy data into pandas dataframe
Args:
sklearn_data (array): name of dataframe
Returns:
panda.array: pandas dataframe
"""
data = pd.DataFrame(data=sklearn_data.data, columns=sklearn_data.feature_names)
data['target'] = pd.Series(sklearn_data.target)
return data
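

# Example usage sketch for numpy_to_pandas, assuming the scikit-learn diabetes
# bunch as input; any loader exposing `data`, `target`, and `feature_names`
# should work the same way. The dataset choice here is only an assumption.
def _example_numpy_to_pandas():
    from sklearn.datasets import load_diabetes  # assumed example dataset
    diabetes_df = numpy_to_pandas(load_diabetes())
    return diabetes_df.head()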
def linear_regression(feature, predictor):
"""
Ordinary least squares Linear Regression.
input: x = independent variables
y = dependent variable
output: R^2
"""
model = LinearRegression()
model.fit(feature, predictor)
return model.score(feature, predictor)
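

# Example usage sketch for linear_regression, again assuming the scikit-learn
# diabetes dataset; it reports the in-sample R^2 of the fitted OLS model.
def _example_linear_regression():
    from sklearn.datasets import load_diabetes  # assumed example dataset
    x_features, y_target = load_diabetes(return_X_y=True)
    return linear_regression(x_features, y_target)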
def lens_1d(x_array, proj='l2norm', random_num=1729, verbosity=0):
    """Creates a 2-D lens for the Mapper algorithm.

    Combines a 1-D Isolation Forest lens with a 1-D projection lens
    (L^2-norm by default). This lens highlights expected features in the data.

    Args:
        x_array (numpy.ndarray): features of the dataset (2-D array)
        proj (str): projection type used by KeplerMapper
        random_num (int): random state for the Isolation Forest
        verbosity (int): verbosity level for KeplerMapper

    Returns:
        lens (numpy.ndarray): combined [Isolation Forest, projection] lens
        mapper (kmapper.KeplerMapper): fitted KeplerMapper instance
    """
    if not isinstance(x_array, np.ndarray):
        print("your input is not an array")
        return None, None
    if len(x_array.shape) != 2:
        print('your input needs to be a 2d array')
        return None, None
    proj_type = ['sum', 'mean', 'median', 'max', 'min', 'std', 'dist_mean',
                 'l2norm', 'knn_distance_n']
    if proj not in proj_type:
        print("you may only use the following projections:", proj_type)
        return None, None
    # Create a custom 1-D lens with Isolation Forest
    model = ensemble.IsolationForest(random_state=random_num)
    model.fit(x_array)
    lens1 = model.decision_function(x_array).reshape((x_array.shape[0], 1))
    # Create another 1-D lens with the chosen projection (L^2-norm by default)
    mapper = km.KeplerMapper(verbose=verbosity)
    lens2 = mapper.fit_transform(x_array, projection=proj)
    # Combine lenses pairwise to get a 2-D lens, i.e. [Isolation Forest, L^2-Norm]
    lens = np.c_[lens1, lens2]
    return lens, mapper
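

# Example usage sketch for lens_1d on a small random 2-D array; the sample size,
# dimensionality, and seed below are arbitrary assumptions for illustration.
def _example_lens_1d():
    rng = np.random.default_rng(1729)
    points = rng.normal(size=(100, 4))
    lens, mapper = lens_1d(points, proj='l2norm')
    return lens.shape, mapper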
def uniform_sampling(dist_matrix, n_sample):
    """Subsamples a data set from its distance matrix.

    Given a distance matrix, returns a subsample that preserves the
    distribution of the original data set, together with the covering
    radius corresponding to the subsampled set.

    Args:
        dist_matrix (numpy.ndarray): distance matrix (2-D array)
        n_sample (int): size of the subsample set

    Returns:
        list_subsample (numpy.ndarray): indices corresponding to the subsample set
        np.max(dist_to_l) (float): covering radius for the subsample set
    """
    if not isinstance(dist_matrix, np.ndarray):
        print("your input is not an array")
        return None, None
    if len(dist_matrix.shape) != 2:
        print('your input needs to be a 2d array')
        return None, None
    n_subsample = int(n_sample)
    if n_subsample <= 0:
        print("Sampling size should be a positive integer.")
        return None, None
    num_points = dist_matrix.shape[0]
    list_subsample = np.random.choice(num_points, n_subsample)
    dist_to_l = np.min(dist_matrix[list_subsample, :], axis=0)
    return list_subsample, np.max(dist_to_l)


def county_crosstab(data, county, year, index, columns):
    """Builds a cross-tabulation for a single county and year.

    Args:
        data (pandas.DataFrame): dataframe containing `county` and `year` columns
        county: county to subset on
        year: year to subset on
        index (str): column used as the crosstab rows
        columns (str): column used as the crosstab columns

    Returns:
        pandas.DataFrame: frequency table of `index` by `columns`
    """
    subset_df = data[data.year == year]
    sub_df = subset_df[subset_df.county == county]
    crosstab = pd.crosstab(index=sub_df[index], columns=sub_df[columns])
    return crosstab
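

# Example usage sketches for uniform_sampling and county_crosstab; the random
# points, the SciPy distance helper, and the toy dataframe with its column
# names and county labels are assumptions made purely for illustration.
def _example_uniform_sampling():
    from scipy.spatial.distance import cdist  # assumed available via SciPy
    rng = np.random.default_rng(0)
    points = rng.normal(size=(50, 3))
    distances = cdist(points, points)  # pairwise Euclidean distance matrix
    return uniform_sampling(distances, 10)


def _example_county_crosstab():
    toy = pd.DataFrame({'year': [2015, 2015, 2016],
                        'county': ['Ingham', 'Ingham', 'Wayne'],
                        'status': ['open', 'closed', 'open'],
                        'type': ['a', 'b', 'a']})
    return county_crosstab(toy, 'Ingham', 2015, 'status', 'type')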