def numpy_to_pandas(sklearn_data):
    """
    Converts a scikit-learn style dataset object into a pandas dataframe.

    Input: sklearn_data = object exposing the attributes `data` (feature
                          values), `feature_names` (one label per feature
                          column) and `target` (one value per row), e.g.
                          the object returned by fetch_california_housing()
    Output: pandas dataframe with one column per feature plus a
            'response' column holding the target values
    """
    # Imported locally so the function does not depend on the surrounding
    # file having a top-level `import pandas as pd` in scope.
    import pandas as pd

    df = pd.DataFrame(
        data=sklearn_data.data,
        columns=sklearn_data.feature_names,
    )
    # The target is attached under the fixed column name 'response'.
    df['response'] = pd.Series(sklearn_data.target)
    return df
def descriptive_statistic(df, n=5):
    """
    Prints brief descriptive statistics on a dataset.

    Input: df = dataframe
           n = number of leading rows to show in the head preview
               (defaults to 5 so one-argument calls keep working)
    Output: nothing is returned; the shape, the first n rows and the
            result of df.describe() are printed to standard output
    """
    print("Shape : ", df.shape)
    print("Head -- \n", df.head(n))
    print("Describe : ", df.describe())
def mysqrt(n):
    """
    Returns the square root of the magnitude of a real number.

    Input: n = real number (int or float); may be negative
    Output: float square root of abs(n), so mysqrt(4) and mysqrt(-4)
            both give 2.0
    """
    if n < 0:
        # Negative input is mapped to its magnitude. The previous version
        # also scaled it by 1.5, which made mysqrt(-4) return sqrt(6)
        # instead of 2.
        n = abs(n)
    return n ** 0.5