From f59783cbad3fd5c44471d0cf52344d2d6cc97080 Mon Sep 17 00:00:00 2001 From: Dirk Colbry <colbrydi@msu.edu> Date: Mon, 1 Jan 2024 13:00:17 -0500 Subject: [PATCH] Adding in Mimesis data generator --- Mimesis.ipynb | 360 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Mimesis.ipynb diff --git a/Mimesis.ipynb b/Mimesis.ipynb new file mode 100644 index 0000000..974cdd2 --- /dev/null +++ b/Mimesis.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6600a76c", + "metadata": {}, + "source": [ + "# Installing required packages " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae65adaf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "d9395405", + "metadata": {}, + "source": [ + "# Quickstart Guide to Mimesis" + ] + }, + { + "cell_type": "markdown", + "id": "85dd7d12", + "metadata": {}, + "source": [ + "## Generating Some Data\n", + "\n", + "\n", + "Now, let's generate some data using Mimesis! We will use the **Person** provider class from Mimesis to generate a simple table containing a set of Names, Ages, Genders, Academic Degrees, and Occupations and convert the data into a Pandas DataFrame." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4c184360", + "metadata": {}, + "outputs": [], + "source": [ + "from mimesis import Person\n", + "from mimesis.locales import Locale\n", + "from mimesis.enums import Gender\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3d762ee0", + "metadata": {}, + "outputs": [], + "source": [ + "person = Person(Locale.EN, seed=0)  # fixed seed so the generated table is reproducible\n", + "\n", + "data_list = [{\n", + " \"Name\": person.full_name(),\n", + " \"Age\": person.age(),\n", + " \"Gender\": person.gender(),\n", + " \"Academic Degree\": person.academic_degree(),\n", + " \"Occupation\": person.occupation(),\n", + "} for _ in range(100)]\n", + "\n", + "df = pd.DataFrame(data_list)" + ] + }, + { + "cell_type": "markdown", + "id": "ab22c620", + "metadata": {}, + "source": [ + "We will then view the first five rows of our generated data." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e5d118b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Name</th>\n", + " <th>Age</th>\n", + " <th>Gender</th>\n", + " <th>Academic Degree</th>\n", + " <th>Occupation</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Jessenia Roy</td>\n", + " <td>31</td>\n", + " <td>Female</td>\n", + " <td>PhD</td>\n", + " <td>Maintenance Fitter</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Jasper Schultz</td>\n", + " <td>21</td>\n", + " <td>Other</td>\n", + " <td>Bachelor</td>\n", + " <td>Racehorse Groom</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>2</th>\n", + " <td>Micki Newton</td>\n", + " <td>41</td>\n", + " <td>Other</td>\n", + " <td>Bachelor</td>\n", + " <td>Foundry Worker</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Doreatha Adams</td>\n", + " <td>39</td>\n", + " <td>Fluid</td>\n", + " <td>Master</td>\n", + " <td>Kitchen Worker</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Milton Ford</td>\n", + " <td>41</td>\n", + " <td>Male</td>\n", + " <td>Bachelor</td>\n", + " <td>Bank Messenger</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Name Age Gender Academic Degree Occupation\n", + "0 Jessenia Roy 31 Female PhD Maintenance Fitter\n", + "1 Jasper Schultz 21 Other Bachelor Racehorse Groom\n", + "2 Micki Newton 41 Other Bachelor Foundry Worker\n", + "3 Doreatha Adams 39 Fluid Master Kitchen Worker\n", + "4 Milton Ford 41 Male Bachelor Bank Messenger" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d46fda7f", + "metadata": {}, + "source": [ + "This is just a small preview of what Mimesis can do in terms of data generation. 
If you want a more detailed guide, you can follow the very detailed instructions at: \n", + "\n", + "https://mimesis.name/en/master/getting_started.html" + ] + }, + { + "cell_type": "markdown", + "id": "b9a289e0", + "metadata": {}, + "source": [ + "# Quick Start Guide to Faker" + ] + }, + { + "cell_type": "markdown", + "id": "f309c13f", + "metadata": {}, + "source": [ + "### Set Up" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cafac6cb", + "metadata": {}, + "outputs": [], + "source": [ + "from faker import Faker\n", + "import pandas as pd\n", + "\n", + "fake = Faker()\n", + "Faker.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "bccf7509", + "metadata": {}, + "source": [ + "### Generating Data" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ad9390db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>job</th>\n", + " <th>company</th>\n", + " <th>ssn</th>\n", + " <th>residence</th>\n", + " <th>current_location</th>\n", + " <th>blood_group</th>\n", + " <th>website</th>\n", + " <th>username</th>\n", + " <th>name</th>\n", + " <th>sex</th>\n", + " <th>address</th>\n", + " <th>mail</th>\n", + " <th>birthdate</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Hydrogeologist</td>\n", + " <td>Gomez, Wright and Chen</td>\n", + " <td>653-68-0948</td>\n", + " <td>5735 Farley Course\\nPort Daniel, OH 79871</td>\n", + " <td>-48.693466</td>\n", + " <td>O+</td>\n", + " <td>https://wright.com/</td>\n", + " <td>robinsondanny</td>\n", + " <td>Michael 
Montgomery</td>\n", + " <td>M</td>\n", + " <td>Unit 4892 Box 6717\\nDPO AE 71770</td>\n", + " <td>bramirez@gmail.com</td>\n", + " <td>1943-01-04</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Hydrogeologist</td>\n", + " <td>Gomez, Wright and Chen</td>\n", + " <td>653-68-0948</td>\n", + " <td>5735 Farley Course\\nPort Daniel, OH 79871</td>\n", + " <td>59.532676</td>\n", + " <td>O+</td>\n", + " <td>http://welch-miller.com/</td>\n", + " <td>robinsondanny</td>\n", + " <td>Michael Montgomery</td>\n", + " <td>M</td>\n", + " <td>Unit 4892 Box 6717\\nDPO AE 71770</td>\n", + " <td>bramirez@gmail.com</td>\n", + " <td>1943-01-04</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " job company ssn \\\n", + "0 Hydrogeologist Gomez, Wright and Chen 653-68-0948 \n", + "1 Hydrogeologist Gomez, Wright and Chen 653-68-0948 \n", + "\n", + " residence current_location blood_group \\\n", + "0 5735 Farley Course\\nPort Daniel, OH 79871 -48.693466 O+ \n", + "1 5735 Farley Course\\nPort Daniel, OH 79871 59.532676 O+ \n", + "\n", + " website username name sex \\\n", + "0 https://wright.com/ robinsondanny Michael Montgomery M \n", + "1 http://welch-miller.com/ robinsondanny Michael Montgomery M \n", + "\n", + " address mail birthdate \n", + "0 Unit 4892 Box 6717\\nDPO AE 71770 bramirez@gmail.com 1943-01-04 \n", + "1 Unit 4892 Box 6717\\nDPO AE 71770 bramirez@gmail.com 1943-01-04 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fake.profile() returns one dict per person. Its `website` and\n", + "# `current_location` values are sequences, so pd.DataFrame(dict) either raises\n", + "# ValueError or repeats one person across rows. Build one row per profile.\n", + "NUM_PROFILES = 100\n", + "profiles = [fake.profile() for _ in range(NUM_PROFILES)]\n", + "df = pd.DataFrame(profiles)\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab