From f59783cbad3fd5c44471d0cf52344d2d6cc97080 Mon Sep 17 00:00:00 2001
From: Dirk Colbry <colbrydi@msu.edu>
Date: Mon, 1 Jan 2024 13:00:17 -0500
Subject: [PATCH] Adding in Mimesis data generator

---
 Mimesis.ipynb | 360 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 360 insertions(+)
 create mode 100644 Mimesis.ipynb

diff --git a/Mimesis.ipynb b/Mimesis.ipynb
new file mode 100644
index 0000000..974cdd2
--- /dev/null
+++ b/Mimesis.ipynb
@@ -0,0 +1,360 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6600a76c",
+   "metadata": {},
+   "source": [
+    "# Installing required packages "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae65adaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -r ../requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9395405",
+   "metadata": {},
+   "source": [
+    "# Quickstart Guide to Mimesis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "85dd7d12",
+   "metadata": {},
+   "source": [
+    "## Generating Some Data\n",
+    "\n",
+    "\n",
+    "Now, let's generate some data using Mimesis! We will use the **Person** provider class from Mimesis to generate a simple table containing a set of Names, Ages, Genders, Academic Degrees, and Occupations and convert the data into a Pandas DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "4c184360",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mimesis import Person\n",
+    "from mimesis.locales import Locale\n",
+    "from mimesis.enums import Gender\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "3d762ee0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fixed seed: the generated table is the same on every fresh run.\n",
+    "person = Person(Locale.EN, seed=0)\n",
+    "# One record (dict) per synthetic person, collected into a DataFrame.\n",
+    "data_list = [{\n",
+    "    \"Name\": person.full_name(),\n",
+    "    \"Age\": person.age(),\n",
+    "    \"Gender\": person.gender(),\n",
+    "    \"Academic Degree\": person.academic_degree(),\n",
+    "    \"Occupation\": person.occupation(),\n",
+    "} for _ in range(100)]\n",
+    "df = pd.DataFrame(data_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab22c620",
+   "metadata": {},
+   "source": [
+    "We will then view the first five rows of our generated data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "e5d118b2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Age</th>\n",
+       "      <th>Gender</th>\n",
+       "      <th>Academic Degree</th>\n",
+       "      <th>Occupation</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Jessenia Roy</td>\n",
+       "      <td>31</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>PhD</td>\n",
+       "      <td>Maintenance Fitter</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jasper Schultz</td>\n",
+       "      <td>21</td>\n",
+       "      <td>Other</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>Racehorse Groom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Micki Newton</td>\n",
+       "      <td>41</td>\n",
+       "      <td>Other</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>Foundry Worker</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Doreatha Adams</td>\n",
+       "      <td>39</td>\n",
+       "      <td>Fluid</td>\n",
+       "      <td>Master</td>\n",
+       "      <td>Kitchen Worker</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Milton Ford</td>\n",
+       "      <td>41</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>Bank Messenger</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             Name  Age  Gender Academic Degree          Occupation\n",
+       "0    Jessenia Roy   31  Female             PhD  Maintenance Fitter\n",
+       "1  Jasper Schultz   21   Other        Bachelor     Racehorse Groom\n",
+       "2    Micki Newton   41   Other        Bachelor      Foundry Worker\n",
+       "3  Doreatha Adams   39   Fluid          Master      Kitchen Worker\n",
+       "4     Milton Ford   41    Male        Bachelor      Bank Messenger"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()  # head() with no argument returns the first five rows"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d46fda7f",
+   "metadata": {},
+   "source": [
+    "This is just a small preview of what Mimesis can do in terms of data generation. For a more thorough walkthrough, follow the official getting-started guide:\n",
+    "\n",
+    "https://mimesis.name/en/master/getting_started.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b9a289e0",
+   "metadata": {},
+   "source": [
+    "# Quickstart Guide to Faker"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f309c13f",
+   "metadata": {},
+   "source": [
+    "### Set Up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "cafac6cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from faker import Faker\n",
+    "import pandas as pd\n",
+    "\n",
+    "fake = Faker()  # generator instance used by the cells below\n",
+    "Faker.seed(0)  # fixed seed so a fresh run reproduces the same fake data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bccf7509",
+   "metadata": {},
+   "source": [
+    "### Generating Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "ad9390db",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>job</th>\n",
+       "      <th>company</th>\n",
+       "      <th>ssn</th>\n",
+       "      <th>residence</th>\n",
+       "      <th>current_location</th>\n",
+       "      <th>blood_group</th>\n",
+       "      <th>website</th>\n",
+       "      <th>username</th>\n",
+       "      <th>name</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>address</th>\n",
+       "      <th>mail</th>\n",
+       "      <th>birthdate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Hydrogeologist</td>\n",
+       "      <td>Gomez, Wright and Chen</td>\n",
+       "      <td>653-68-0948</td>\n",
+       "      <td>5735 Farley Course\\nPort Daniel, OH 79871</td>\n",
+       "      <td>-48.693466</td>\n",
+       "      <td>O+</td>\n",
+       "      <td>https://wright.com/</td>\n",
+       "      <td>robinsondanny</td>\n",
+       "      <td>Michael Montgomery</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Unit 4892 Box 6717\\nDPO AE 71770</td>\n",
+       "      <td>bramirez@gmail.com</td>\n",
+       "      <td>1943-01-04</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Hydrogeologist</td>\n",
+       "      <td>Gomez, Wright and Chen</td>\n",
+       "      <td>653-68-0948</td>\n",
+       "      <td>5735 Farley Course\\nPort Daniel, OH 79871</td>\n",
+       "      <td>59.532676</td>\n",
+       "      <td>O+</td>\n",
+       "      <td>http://welch-miller.com/</td>\n",
+       "      <td>robinsondanny</td>\n",
+       "      <td>Michael Montgomery</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Unit 4892 Box 6717\\nDPO AE 71770</td>\n",
+       "      <td>bramirez@gmail.com</td>\n",
+       "      <td>1943-01-04</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              job                 company          ssn  \\\n",
+       "0  Hydrogeologist  Gomez, Wright and Chen  653-68-0948   \n",
+       "1  Hydrogeologist  Gomez, Wright and Chen  653-68-0948   \n",
+       "\n",
+       "                                   residence current_location blood_group  \\\n",
+       "0  5735 Farley Course\\nPort Daniel, OH 79871       -48.693466          O+   \n",
+       "1  5735 Farley Course\\nPort Daniel, OH 79871        59.532676          O+   \n",
+       "\n",
+       "                    website       username                name sex  \\\n",
+       "0       https://wright.com/  robinsondanny  Michael Montgomery   M   \n",
+       "1  http://welch-miller.com/  robinsondanny  Michael Montgomery   M   \n",
+       "\n",
+       "                            address                mail   birthdate  \n",
+       "0  Unit 4892 Box 6717\\nDPO AE 71770  bramirez@gmail.com  1943-01-04  \n",
+       "1  Unit 4892 Box 6717\\nDPO AE 71770  bramirez@gmail.com  1943-01-04  "
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# profile() describes ONE person; its 'current_location' (tuple) and\n",
+    "# 'website' (list) values are sequences, so passing the bare dict to\n",
+    "# DataFrame spreads one person over several rows, or raises ValueError\n",
+    "# when the two lengths differ. Wrapping each profile in a list of\n",
+    "# records yields exactly one row per person with no retry loop.\n",
+    "df = pd.DataFrame([fake.profile() for _ in range(100)])\n",
+    "df.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab