Tabular-GAN-Project-5Y-INSA/fake_data_analysis.ipynb

128 lines
3.6 KiB
Plaintext
Raw Permalink Normal View History

2023-01-07 06:30:24 +00:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fake Data Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.stats import norm\n",
"from data_treatment import DataAtts\n",
"import ipywidgets as widgets\n",
"import matplotlib.pyplot as plt\n",
"import glob\n",
"from compare_data import *\n",
"from IPython.display import display"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fake CSVs are looked up under fake_data/<dataset name>/, where the dataset\n",
"# name is the original file's base name without its extension. Deriving it\n",
"# from the path (rather than slicing at fixed offsets) keeps this correct if\n",
"# the directory or extension ever changes.\n",
"original_path = 'original_data/diabetes.csv'\n",
"folder_name = original_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]\n",
"\n",
"# glob returns matches in arbitrary, OS-dependent order; sorting keeps the\n",
"# dropdown listing (and therefore its default selection) stable between runs.\n",
"fake_files_dropdown = widgets.Dropdown(\n",
"    options=sorted(glob.glob(\"fake_data/\" + folder_name + \"/*.csv\")),\n",
"    description='Fake file:',\n",
"    disabled=False,\n",
")\n",
"display(fake_files_dropdown)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_name = 'original_data/diabetes.csv'\n",
"dataAtts = DataAtts(file_name)\n",
"class_col = dataAtts.class_name\n",
"\n",
"data = pd.read_csv(file_name)\n",
"fake_data = pd.read_csv(fake_files_dropdown.value)\n",
"\n",
"# Threshold the fake class column at 0.5 so it only holds the labels 0 and 1.\n",
"# Bracket indexing is used instead of attribute access so the lookup also works\n",
"# for column names that are not valid identifiers or that shadow a DataFrame\n",
"# attribute.\n",
"fake_data.loc[fake_data[class_col] >= 0.5, class_col] = 1\n",
"fake_data.loc[fake_data[class_col] < 0.5, class_col] = 0\n",
"\n",
"\n",
"def class_share(frame, label):\n",
"    \"\"\"Return, as text, the percentage of rows of `frame` whose class is `label`.\n",
"\n",
"    The percentage is rounded to two decimals. When the label does not occur\n",
"    at all the string \"0\" is returned. Only that one expected situation is\n",
"    handled; any other error (missing column, unreadable data, ...) is left\n",
"    to surface instead of being silently swallowed.\n",
"    \"\"\"\n",
"    counts = frame[class_col].value_counts()\n",
"    if label not in counts.index:\n",
"        return \"0\"\n",
"    return str(round(counts[label] / len(frame) * 100, 2))\n",
"\n",
"\n",
"print(dataAtts.message)\n",
"real_counts = data[class_col].value_counts()\n",
"print(dataAtts.values_names[0], round(real_counts[0] / len(data) * 100, 2), '% of the dataset')\n",
"print(dataAtts.values_names[1], round(real_counts[1] / len(data) * 100, 2), '% of the dataset')\n",
"\n",
"print(\"\\nFake Data\")\n",
"share_class_0 = class_share(fake_data, 0)\n",
"share_class_1 = class_share(fake_data, 1)\n",
"\n",
"print(\"Outcome = 0: \", share_class_0, '% of the dataset')\n",
"print(\"Outcome = 1: \", share_class_1, '% of the dataset')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Overlay, for every column of the real dataset, the normalised histogram of\n",
"# the real values and of the fake values.\n",
"classes = list(data)\n",
"\n",
"for name in classes:\n",
"    # Skip the \"Unnamed: 32\" column if the dataset has one.\n",
"    if name == \"Unnamed: 32\":\n",
"        continue\n",
"\n",
"    # One explicit figure per column: it is closed at the end of the iteration,\n",
"    # so no empty figure is left behind to be rendered after the loop.\n",
"    fig, ax = plt.subplots()\n",
"    ax.set_xlabel('Values')\n",
"    ax.set_ylabel('Probability')\n",
"    ax.set_title(name + \" distribution\")\n",
"\n",
"    real_dist = data[name].values\n",
"    fake_dist = fake_data[name].values\n",
"    ax.hist(real_dist, 50, density=True, alpha=0.5, label='Real')\n",
"    ax.hist(fake_dist, 50, density=True, alpha=0.5, facecolor='r', label='Fake')\n",
"    ax.legend()\n",
"\n",
"    # Uncomment to also save each figure next to the fake data:\n",
"    # fig.savefig('fake_data/' + dataAtts.fname + \"/\" + name + '_distribution.png')\n",
"    plt.show()\n",
"    plt.close(fig)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tabular_gan",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8 | packaged by conda-forge | (main, Nov 24 2022, 14:07:00) [MSC v.1916 64 bit (AMD64)]"
},
"vscode": {
"interpreter": {
"hash": "2f1136a7f15cd1225735fd9261403f7c342baa42a12d30e4630e4cfef11f2512"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}