diff --git a/Topological_ML/TDA_Prediction.py b/Topological_ML/TDA_Prediction.py
index 147594cedfb197bd34483908ab891469207cac9e..83c56b396d95d659257cb92c07a0f6ea2f9490de 100644
--- a/Topological_ML/TDA_Prediction.py
+++ b/Topological_ML/TDA_Prediction.py
@@ -1,3 +1,14 @@
+from Topological_ML import TDA_Prediction as tdap
+from sklearn.datasets import fetch_california_housing
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+import kmapper as km
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import sklearn
+from sklearn import ensemble
+
 def numpy_to_pandas(sklearn_data):
     """
     Converts scikit-learn numpy data into pandas dataframe.
@@ -8,16 +19,17 @@ def numpy_to_pandas(sklearn_data):
     df['response'] = pd.Series(sklearn_data.target)
     return df
 
-def descriptive_statistic(df, n):
+def data_summary(df, n):
     """
-    Provides brief descriptive statistics on dataset.
+    Provides a brief summary of a dataset.
     Input: df = dataframe
            n = the first n rows of the dataframe
-    Output: shape, head, and descriptive statistics of dataframe
+    Output: dictionary with keys 'head' (first n rows) and 'shape' (df.shape)
     """
-    print("Shape : ", df.shape)
-    print("Head -- \n", df.head(n))
-    print("Describe : ", df.describe())
+    d = dict()
+    d['head'] = df.head(n)
+    d['shape'] = df.shape
+    return d
 
 def model_selection(df):
     """
@@ -38,29 +50,55 @@ def MSE_fit(fit):
     MSE = None
     return MSE
 
-def accuracy_metrics(fit, MSE):
+def accuracy_metrics(fit, MSE, n, k):
     """
     This function is used for model validation.
     It returns a dictionary of several regression model accuracy metrics.
     Its inputs are a fitted model and the MSE of the fitted model.
+    Input: fit = fitted model (not used by the metrics implemented so far)
+           MSE = mean squared error of the fitted model, assumed to be SSE / n (TODO confirm)
+           n = number of observations
+           k = number of estimated parameters
+    Output: dictionary of metrics; metrics that are not implemented yet are None
     """
     d = dict()
-    sumObj = None
-    SSE = None
-    n = None
+    # The residuals are not available in this function, so SSE is recovered from MSE.
+    SSE = MSE * n
     p = None
     pr = None
     d['R2'] = None
     d['R2ad'] = None
-    d['AIC'] = None
-    d['BIC'] = None
+    # Least-squares forms of the information criteria (additive constants dropped).
+    d['AIC'] = n*np.log(SSE/n) + 2*k
+    d['BIC'] = n*np.log(SSE/n) + k*np.log(n)
     d['PRESS'] = None
     d['Cp']= None
-    return None
+    return d
 
+def linear_regression(x, y):
+    """
+    Ordinary least squares Linear Regression.
+    input: x = independent variables
+           y = dependent variable
+    output: R^2 of the fitted model on the same x and y
+    """
+    model = LinearRegression()
+    model.fit(x, y)
+    return model.score(x, y)
 
-def mysqrt(n):
-    if n < 0:
-        n = 1.5*abs(n)
-    sqrt1 = n**(1/2)
-    return sqrt1
\ No newline at end of file
+def lens_1d(X, rs, v):
+    """
+    Builds a lens (filter values) from the rows of X.
+    input: X = data array with one row per observation
+           rs = random_state passed to IsolationForest
+           v = verbose level passed to KeplerMapper
+    output: array whose columns are the IsolationForest decision function
+            followed by the l2norm projection of X
+    """
+    model = sklearn.ensemble.IsolationForest(random_state = rs)
+    model.fit(X)
+    lens1 = model.decision_function(X).reshape((X.shape[0], 1))
+    mapper = km.KeplerMapper(verbose = v)
+    lens2 = mapper.fit_transform(X, projection="l2norm")
+    lens = np.c_[lens1, lens2]
+    return lens