made changes to prediction functions

5a63479a · shawk masboob · fe6863e6 · 5a63479a · 5a63479a
Commit 5a63479a authored 5 years ago by shawk masboob
--- a/TDA_Prediction.ipynb
+++ b/TDA_Prediction.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h1><center>Prediction using Topological Data Analysis</center></h1>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Give brief overview of notebook's purpose.\n",
+    "Also maybe add cool picture."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Topological_ML import TDA_Prediction as tdap\n",
+    "from sklearn.datasets import fetch_california_housing\n",
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cal_housing = fetch_california_housing()\n",
+    "\n",
+    "def numpy_to_pandas(sklearn_data):\n",
+    "    df = pd.DataFrame(data = sklearn_data.data, columns = sklearn_data.feature_names)\n",
+    "    df['response'] = pd.Series(sklearn_data.target)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = numpy_to_pandas(cal_housing)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Shape :  (20640, 9)\n",
+      "Head -- \n",
+      "    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \\\n",
+      "0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   \n",
+      "1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   \n",
+      "2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   \n",
+      "3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   \n",
+      "4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   \n",
+      "\n",
+      "   Longitude  target  \n",
+      "0    -122.23   4.526  \n",
+      "1    -122.22   3.585  \n",
+      "2    -122.24   3.521  \n",
+      "3    -122.25   3.413  \n",
+      "4    -122.25   3.422  \n",
+      "Describe :               MedInc      HouseAge      AveRooms     AveBedrms    Population  \\\n",
+      "count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   \n",
+      "mean       3.870671     28.639486      5.429000      1.096675   1425.476744   \n",
+      "std        1.899822     12.585558      2.474173      0.473911   1132.462122   \n",
+      "min        0.499900      1.000000      0.846154      0.333333      3.000000   \n",
+      "25%        2.563400     18.000000      4.440716      1.006079    787.000000   \n",
+      "50%        3.534800     29.000000      5.229129      1.048780   1166.000000   \n",
+      "75%        4.743250     37.000000      6.052381      1.099526   1725.000000   \n",
+      "max       15.000100     52.000000    141.909091     34.066667  35682.000000   \n",
+      "\n",
+      "           AveOccup      Latitude     Longitude        target  \n",
+      "count  20640.000000  20640.000000  20640.000000  20640.000000  \n",
+      "mean       3.070655     35.631861   -119.569704      2.068558  \n",
+      "std       10.386050      2.135952      2.003532      1.153956  \n",
+      "min        0.692308     32.540000   -124.350000      0.149990  \n",
+      "25%        2.429741     33.930000   -121.800000      1.196000  \n",
+      "50%        2.818116     34.260000   -118.490000      1.797000  \n",
+      "75%        3.282261     37.710000   -118.010000      2.647250  \n",
+      "max     1243.333333     41.950000   -114.310000      5.000010  \n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "def descriptive_statistic(df, n):\n",
+    "    \"\"\"\n",
+    "    Provides brief descriptive statistics on dataset. \n",
+    "    Takes dataframe as input.\n",
+    "    \"\"\"\n",
+    "    print(\"Shape : \", df.shape)\n",
+    "    print(\"Head -- \\n\", df.head(n))\n",
+    "    print(\"Describe : \", df.describe())\n",
+    "    \n",
+    "descriptive_statistic(df, 5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def test_srt_rt1():\n",
+    "    assert tdap.mysqrt(-4) == 2\n",
+    "    return\n",
+    "\n",
+    "def test_srt_rt2():\n",
+    "    assert tdap.mysqrt(4) == 2\n",
+    "    return\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:markdown id: tags:
+
+<h1><center>Prediction using Topological Data Analysis</center></h1>
+
+%% Cell type:markdown id: tags:
+
+Give brief overview of notebook's purpose.
+Also maybe add cool picture.
+
+%% Cell type:code id: tags:
+
+``` python
+from Topological_ML import TDA_Prediction as tdap
+from sklearn.datasets import fetch_california_housing
+import pandas as pd
+import numpy as np
+```
+
+%% Cell type:code id: tags:
+
+``` python
+cal_housing = fetch_california_housing()
+
+def numpy_to_pandas(sklearn_data):
+    df = pd.DataFrame(data = sklearn_data.data, columns = sklearn_data.feature_names)
+    df['response'] = pd.Series(sklearn_data.target)
+    return df
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df = numpy_to_pandas(cal_housing)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def descriptive_statistic(df, n):
+    """
+    Provides brief descriptive statistics on dataset.
+    Takes dataframe as input.
+    """
+    print("Shape : ", df.shape)
+    print("Head -- \n", df.head(n))
+    print("Describe : ", df.describe())
+
+descriptive_statistic(df, 5)
+```
+
+%% Output
+
+    Shape :  (20640, 9)
+    Head --
+        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
+    0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88
+    1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86
+    2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85
+    3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85
+    4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85
+    
+       Longitude  target
+    0    -122.23   4.526
+    1    -122.22   3.585
+    2    -122.24   3.521
+    3    -122.25   3.413
+    4    -122.25   3.422
+    Describe :               MedInc      HouseAge      AveRooms     AveBedrms    Population  \
+    count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000
+    mean       3.870671     28.639486      5.429000      1.096675   1425.476744
+    std        1.899822     12.585558      2.474173      0.473911   1132.462122
+    min        0.499900      1.000000      0.846154      0.333333      3.000000
+    25%        2.563400     18.000000      4.440716      1.006079    787.000000
+    50%        3.534800     29.000000      5.229129      1.048780   1166.000000
+    75%        4.743250     37.000000      6.052381      1.099526   1725.000000
+    max       15.000100     52.000000    141.909091     34.066667  35682.000000
+    
+               AveOccup      Latitude     Longitude        target
+    count  20640.000000  20640.000000  20640.000000  20640.000000
+    mean       3.070655     35.631861   -119.569704      2.068558
+    std       10.386050      2.135952      2.003532      1.153956
+    min        0.692308     32.540000   -124.350000      0.149990
+    25%        2.429741     33.930000   -121.800000      1.196000
+    50%        2.818116     34.260000   -118.490000      1.797000
+    75%        3.282261     37.710000   -118.010000      2.647250
+    max     1243.333333     41.950000   -114.310000      5.000010
+    None
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def test_srt_rt1():
+    assert tdap.mysqrt(-4) == 2
+    return
+
+def test_srt_rt2():
+    assert tdap.mysqrt(4) == 2
+    return
+
+```
--- a/Topological_ML/TDA_Prediction.py
+++ b/Topological_ML/TDA_Prediction.py
-def dataload():
+def numpy_to_pandas(sklearn_data):
    """
-    upload toy datasets from scikit-learn
+    Converts scikit-learn numpy data into pandas dataframe.
+    Input: sklearn_data = scikit-learn dataset object with data, feature_names and target attributes
+    Output: pandas dataframe
    """
-    data = None
-    return data
+    df = pd.DataFrame(data = sklearn_data.data, columns = sklearn_data.feature_names)
+    df['response'] = pd.Series(sklearn_data.target)
+    return df

-def datafetch(file_name):
-    """
-    upload real world datasets from scikit-learn
-    """
-    data = None
-    print("reading data from:", file_name)
-    return data
-
-def descriptive_statistic(df):
+def descriptive_statistic(df, n):
    """
    Provides brief descriptive statistics on dataset. 
-    Takes dataframe as input.
+    Input: df = dataframe
+           n = number of leading rows to show
+    Output: prints the shape, first n rows, and descriptive statistics of the dataframe
    """
-    print("Type : ", None, "\n\n")
-    print("Shape : ", None)
-    print("Head -- \n", None)
-    print("\n\n Tail -- \n", None)
-    print("Describe : ", None)
+    print("Shape : ", df.shape)
+    print("Head -- \n", df.head(n))
+    print("Describe : ", df.describe())
    
 def model_selection(df):
    """
@@ -61,4 +56,11 @@ def accuracy_metrics(fit, MSE):
    d['BIC'] = None
    d['PRESS'] = None
    d['Cp']= None
-    return d
\ No newline at end of file
+    return None
+
+
+def mysqrt(n):
+    """Return the square root of abs(n); the sign of n is ignored."""
+    # Negative input uses its magnitude unscaled, so mysqrt(-4) == mysqrt(4).
+    sqrt1 = abs(n)**(1/2)
+    return sqrt1
\ No newline at end of file