diff --git a/tpot_tutorial.ipynb b/tpot_tutorial.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7e28f42e8f6f5aa231b349623253761ecdb3d430 --- /dev/null +++ b/tpot_tutorial.ipynb @@ -0,0 +1,958 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "29cb46d7", + "metadata": {}, + "source": [ + "Install command\n", + "%pip install tpot" + ] + }, + { + "cell_type": "markdown", + "id": "60aa22cd", + "metadata": {}, + "source": [ + "What is TPOT?\n", + "* TPOT is an automated machine learning tool that uses genetic programming to optimize machine learning pipelines. It is essentially an assistant for tree-based pipeline optimization. <br/>\n", + "<br/>\n", + "\n", + "What/Who is it good for? \n", + "* TPOT takes the most tedious part of machine learning off your hands. It does this by exploring many candidate pipelines and finding the best one for the data you are working with. \n", + "* This AutoML tool is a real asset when you want competitive classification accuracy without hand-tuning every model. Beyond model selection, it can identify artificial feature constructors and novel pipeline operators that further improve accuracy. TPOT chains these operators together into a sequence of operations that act on the given dataset. <br/>\n", + "<br/>\n", + "\n", + "How to Install\n", + "* We installed TPOT with `pip install tpot`. PyTorch may be installed as well, but it is not necessary; it can be added with `conda install pytorch`. <br/>\n", + "<br/>\n", + "\n", + "Rename target to class – important step" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "022ab1ce", + "metadata": {}, + "outputs": [], + "source": [ + "from tpot import TPOTClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.datasets import load_iris\n", + "import pandas as pd \n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2898aaa9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[5.1, 3.5, 1.4, 0.2],\n", + " [4.9, 3. , 1.4, 0.2],\n", + " [4.7, 3.2, 1.3, 0.2],\n", + " [4.6, 3.1, 1.5, 0.2],\n", + " [5. 
, 3.6, 1.4, 0.2]]),\n", + " array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", + " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", + " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris = load_iris()\n", + "iris.data[0:5], iris.target" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1d1ac27a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((112, 4), (38, 4), (112,), (38,))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=0.75, test_size=0.25)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "015515d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "31f16c0878af4057a2347e6042c56a75", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Optimization Progress: 0%| | 0/100 [00:00<?, ?pipeline/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2.41 minutes have elapsed. TPOT will close down.\n", + "TPOT closed during evaluation in one generation.\n", + "WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.\n", + "\n", + "\n", + "TPOT closed prematurely. 
Will use the current best pipeline.\n", + "\n", + "Best pipeline: MLPClassifier(input_matrix, alpha=0.001, learning_rate_init=0.001)\n", + "0.9736842105263158\n" + ] + } + ], + "source": [ + "tpot = TPOTClassifier(verbosity=2, max_time_mins=2)\n", + "tpot.fit(X_train, y_train)\n", + "print(tpot.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fedcae2c", + "metadata": {}, + "outputs": [], + "source": [ + "tpot.export('tpot_iris_pipeline.py')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f2ac0eda", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PassengerId</th>\n", + " <th>Survived</th>\n", + " <th>Pclass</th>\n", + " <th>Name</th>\n", + " <th>Sex</th>\n", + " <th>Age</th>\n", + " <th>SibSp</th>\n", + " <th>Parch</th>\n", + " <th>Ticket</th>\n", + " <th>Fare</th>\n", + " <th>Cabin</th>\n", + " <th>Embarked</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " <td>Braund, Mr. Owen Harris</td>\n", + " <td>male</td>\n", + " <td>22.0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>A/5 21171</td>\n", + " <td>7.2500</td>\n", + " <td>NaN</td>\n", + " <td>S</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", + " <td>female</td>\n", + " <td>38.0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>PC 17599</td>\n", + " <td>71.2833</td>\n", + " <td>C85</td>\n", + " <td>C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>Heikkinen, Miss. Laina</td>\n", + " <td>female</td>\n", + " <td>26.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>STON/O2. 3101282</td>\n", + " <td>7.9250</td>\n", + " <td>NaN</td>\n", + " <td>S</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", + " <td>female</td>\n", + " <td>35.0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>113803</td>\n", + " <td>53.1000</td>\n", + " <td>C123</td>\n", + " <td>S</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " <td>Allen, Mr. William Henry</td>\n", + " <td>male</td>\n", + " <td>35.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>373450</td>\n", + " <td>8.0500</td>\n", + " <td>NaN</td>\n", + " <td>S</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. 
William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic = pd.read_csv('titanic_train.csv')\n", + "titanic.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "30eeb3aa", + "metadata": {}, + "outputs": [], + "source": [ + "#rename target to class\n", + "titanic.rename(columns={'Survived': 'class'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "bcc561a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of levels in category 'Name': \b 891.00 \n", + "Number of levels in category 'Sex': \b 2.00 \n", + "Number of levels in category 'Ticket': \b 681.00 \n", + "Number of levels in category 'Cabin': \b 148.00 \n", + "Number of levels in category 'Embarked': \b 4.00 \n" + ] + } + ], + "source": [ + "for cat in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:\n", + " print(\"Number of levels in category '{0}': \\b {1:2.2f} \".format(cat, titanic[cat].unique().size))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5be7251f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Levels for category 'Sex': ['male' 'female']\n", + "Levels for category 'Embarked': ['S' 'C' 'Q' nan]\n" + ] + } + ], + "source": [ + "for cat in ['Sex', 'Embarked']:\n", + " print(\"Levels for category '{0}': {1}\".format(cat, titanic[cat].unique()))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8bb50fa8", + "metadata": {}, + "outputs": [], + "source": [ + "titanic['Sex'] = titanic['Sex'].map({'male':0,'female':1})\n", + "titanic['Embarked'] = titanic['Embarked'].map({'S':0,'C':1,'Q':2})" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "11c06d01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PassengerId False\n", + "class False\n", + "Pclass False\n", + "Name False\n", + "Sex False\n", + "Age False\n", + "SibSp False\n", + "Parch False\n", + "Ticket False\n", + "Fare False\n", + "Cabin False\n", + "Embarked False\n", + "dtype: bool" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic = titanic.fillna(-999)\n", + "pd.isnull(titanic).any()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "aa017a2a", + "metadata": {}, + "outputs": [], + "source": [ + "# one-hot encode the Cabin column: each unique cabin value becomes a binary feature\n", + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "mlb = MultiLabelBinarizer()\n", + "CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "34567d07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [1, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [1, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [1, 0, 0, ..., 0, 0, 0]])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_new = 
titanic.drop(['Name','Ticket','Cabin','class'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e8ccd33c", + "metadata": {}, + "outputs": [], + "source": [ + "assert (len(titanic['Cabin'].unique()) == len(mlb.classes_)), \"Not Equal\" #check correct encoding done" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "594776f6", + "metadata": {}, + "outputs": [], + "source": [ + "titanic_new = np.hstack((titanic_new.values,CabinTrans))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e8d47aef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.isnan(titanic_new).any()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "6fb6b7df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "156" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_new[0].size" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "09fe6803", + "metadata": {}, + "outputs": [], + "source": [ + "titanic_class = titanic['class'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a14bb5fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(668, 223)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_indices, validation_indices = train_test_split(titanic.index, stratify = titanic_class, train_size=0.75, test_size=0.25)\n", + "training_indices.size, validation_indices.size" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "227863a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Optimization Progress: 0%| | 0/40 [00:00<?, ?pipeline/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Generation 1 - Current best internal CV score: 0.8023903041185052\n", + "\n", + "2.03 minutes have elapsed. TPOT will close down.\n", + "TPOT closed during evaluation in one generation.\n", + "WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.\n", + "\n", + "\n", + "TPOT closed prematurely. 
Will use the current best pipeline.\n", + "\n", + "Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=6, max_features=0.3, min_samples_leaf=20, min_samples_split=15, n_estimators=100, subsample=0.45)\n" + ] + }, + { + "data": { + "text/plain": [ + "TPOTClassifier(max_eval_time_mins=0.04, max_time_mins=2, population_size=40,\n", + " verbosity=2)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tpot = TPOTClassifier(verbosity=2, max_time_mins=2, max_eval_time_mins=0.04, population_size=40)\n", + "tpot.fit(titanic_new[training_indices], titanic_class[training_indices])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "ca18f35b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8251121076233184" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tpot.score(titanic_new[validation_indices], titanic.loc[validation_indices, 'class'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "4d710e07", + "metadata": {}, + "outputs": [], + "source": [ + "tpot.export('tpot_titanic_pipeline.py')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "d7ff45ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PassengerId</th>\n", + " <th>Pclass</th>\n", + " <th>Age</th>\n", + " <th>SibSp</th>\n", + " <th>Parch</th>\n", + " <th>Fare</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>418.000000</td>\n", + " <td>418.000000</td>\n", + " <td>332.000000</td>\n", + " <td>418.000000</td>\n", + " <td>418.000000</td>\n", + " <td>417.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>1100.500000</td>\n", + " <td>2.265550</td>\n", + " <td>30.272590</td>\n", + " <td>0.447368</td>\n", + " <td>0.392344</td>\n", + " <td>35.627188</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>120.810458</td>\n", + " <td>0.841838</td>\n", + " <td>14.181209</td>\n", + " <td>0.896760</td>\n", + " <td>0.981429</td>\n", + " <td>55.907576</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>892.000000</td>\n", + " <td>1.000000</td>\n", + " <td>0.170000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>996.250000</td>\n", + " <td>1.000000</td>\n", + " <td>21.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>7.895800</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>1100.500000</td>\n", + " <td>3.000000</td>\n", + " <td>27.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>14.454200</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>1204.750000</td>\n", + " <td>3.000000</td>\n", + " <td>39.000000</td>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>31.500000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>1309.000000</td>\n", + " <td>3.000000</td>\n", + " 
<td>76.000000</td>\n", + " <td>8.000000</td>\n", + " <td>9.000000</td>\n", + " <td>512.329200</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " PassengerId Pclass Age SibSp Parch Fare\n", + "count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000\n", + "mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188\n", + "std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576\n", + "min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n", + "25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800\n", + "50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n", + "75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000\n", + "max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_sub = pd.read_csv('titanic_test.csv')\n", + "titanic_sub.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "8264d2ed", + "metadata": {}, + "outputs": [], + "source": [ + "for var in ['Cabin']: #,'Name','Ticket']:\n", + " new = list(set(titanic_sub[var]) - set(titanic[var]))\n", + " titanic_sub.loc[titanic_sub[var].isin(new), var] = -999" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fe8198e3", + "metadata": {}, + "outputs": [], + "source": [ + "titanic_sub['Sex'] = titanic_sub['Sex'].map({'male':0,'female':1})\n", + "titanic_sub['Embarked'] = titanic_sub['Embarked'].map({'S':0,'C':1,'Q':2})" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "13204313", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PassengerId False\n", + "Pclass False\n", + "Name False\n", + "Sex False\n", + "Age False\n", + "SibSp False\n", + "Parch False\n", + "Ticket False\n", + "Fare False\n", + "Cabin False\n", + "Embarked False\n", + "dtype: bool" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_sub = titanic_sub.fillna(-999)\n", + "pd.isnull(titanic_sub).any()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "82e8d3fb", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "mlb = MultiLabelBinarizer()\n", + "SubCabinTrans = mlb.fit([{str(val)} for val in titanic['Cabin'].values]).transform([{str(val)} for val in titanic_sub['Cabin'].values])\n", + "titanic_sub = titanic_sub.drop(['Name','Ticket','Cabin'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "185ba8c1", + "metadata": {}, + "outputs": [], + "source": [ + "titanic_sub_new = np.hstack((titanic_sub.values,SubCabinTrans))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "359c8b6b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.any(np.isnan(titanic_sub_new))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e73a0c32", + "metadata": {}, + "outputs": [], + "source": [ + "assert (titanic_new.shape[1] == titanic_sub_new.shape[1]), \"Not Equal\" " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "d868e452", + "metadata": {}, + "outputs": [], + "source": [ + "submission = tpot.predict(titanic_sub_new)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "3d91d737", + 
"metadata": {}, + "outputs": [], + "source": [ + "final = pd.DataFrame({'PassengerId': titanic_sub['PassengerId'], 'Survived': submission})\n", + "final.to_csv('data/submission.csv', index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "240feb73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(418, 2)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final.shape" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}