{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fake Data Analysis\n",
    "\n",
    "Compares a generated (fake) CSV against the original dataset: class balance first, then one overlaid histogram per column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import order is kept as in the original notebook: the star import below may\n",
    "# rebind names, so reordering could change which definition wins.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.stats import norm\n",
    "from data_treatment import DataAtts\n",
    "import ipywidgets as widgets\n",
    "import matplotlib.pyplot as plt\n",
    "import glob\n",
    "from compare_data import *\n",
    "from IPython.display import display\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single source of truth for the real dataset; the fake-data folder name is derived from it.\n",
    "file_name = 'original_data/diabetes.csv'\n",
    "# 'original_data/diabetes.csv' -> 'diabetes'\n",
    "folder_name = Path(file_name).stem\n",
    "\n",
    "# Sorted so the dropdown order (and therefore its default selection) is deterministic.\n",
    "fake_files_dropdown = widgets.Dropdown(\n",
    "    options=sorted(glob.glob(\"fake_data/\" + folder_name + \"/*.csv\")),\n",
    "    description='Fake file:',\n",
    "    disabled=False,\n",
    ")\n",
    "display(fake_files_dropdown)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def class_percentage(frame, column, label):\n",
    "    \"\"\"Return the percentage of rows in `frame` whose `column` equals `label`.\n",
    "\n",
    "    The value is rounded to two decimals. When `label` never occurs in the\n",
    "    column (which includes an empty `frame`) the function returns 0 instead\n",
    "    of raising, so a dataset that contains a single class can still be reported.\n",
    "    \"\"\"\n",
    "    counts = frame[column].value_counts()\n",
    "    if label not in counts.index:\n",
    "        return 0\n",
    "    # .loc makes the lookup explicitly label-based rather than positional.\n",
    "    return round(counts.loc[label] / len(frame) * 100, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): an empty dropdown presumably reports None as its value; fail with a\n",
    "# readable message here instead of handing None to read_csv.\n",
    "if fake_files_dropdown.value is None:\n",
    "    raise FileNotFoundError(\"No fake data CSV selected; expected files under fake_data/\" + folder_name + \"/\")\n",
    "\n",
    "dataAtts = DataAtts(file_name)\n",
    "class_col = dataAtts.class_name\n",
    "\n",
    "data = pd.read_csv(file_name)\n",
    "fake_data = pd.read_csv(fake_files_dropdown.value)\n",
    "\n",
    "# Snap the fake class column to {0, 1} by thresholding at 0.5. Both masks are\n",
    "# computed before writing; NaN matches neither mask and is left untouched.\n",
    "is_one = fake_data[class_col] >= 0.5\n",
    "is_zero = fake_data[class_col] < 0.5\n",
    "fake_data.loc[is_one, class_col] = 1\n",
    "fake_data.loc[is_zero, class_col] = 0\n",
    "\n",
    "print(dataAtts.message)\n",
    "print(dataAtts.values_names[0], class_percentage(data, class_col, 0), '% of the dataset')\n",
    "print(dataAtts.values_names[1], class_percentage(data, class_col, 1), '% of the dataset')\n",
    "\n",
    "print(\"\\nFake Data\")\n",
    "fake_pct_class0 = class_percentage(fake_data, class_col, 0)\n",
    "fake_pct_class1 = class_percentage(fake_data, class_col, 1)\n",
    "\n",
    "print(\"Outcome = 0: \", fake_pct_class0, '% of the dataset')\n",
    "print(\"Outcome = 1: \", fake_pct_class1, '% of the dataset')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classes = list(data)\n",
    "\n",
    "for name in classes:\n",
    "    # NOTE(review): presumably an artefact column present in some datasets; skipped as before.\n",
    "    if name == \"Unnamed: 32\":\n",
    "        continue\n",
    "\n",
    "    fig, ax = plt.subplots()\n",
    "    ax.set_xlabel('Values')\n",
    "    ax.set_ylabel('Probability')\n",
    "    ax.set_title(name + \" distribution\")\n",
    "\n",
    "    real_dist = data[name].values\n",
    "    fake_dist = fake_data[name].values\n",
    "    # Real data keeps the default colour, fake data is drawn in red on top.\n",
    "    ax.hist(real_dist, 50, density=True, alpha=0.5, label='Real')\n",
    "    ax.hist(fake_dist, 50, density=True, alpha=0.5, facecolor='r', label='Fake')\n",
    "    ax.legend()\n",
    "    #fig.savefig('fake_data/'+ dataAtts.fname + \"/\"+name+'_distribution.png')\n",
    "    plt.show()\n",
    "    # Release the figure so looping over many columns does not accumulate open figures.\n",
    "    plt.close(fig)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tabular_gan",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8 | packaged by conda-forge | (main, Nov 24 2022, 14:07:00) [MSC v.1916 64 bit (AMD64)]"
  },
  "vscode": {
   "interpreter": {
    "hash": "2f1136a7f15cd1225735fd9261403f7c342baa42a12d30e4630e4cfef11f2512"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}