Tabular-GAN-Project-5Y-INSA/compare_data.py

98 lines
3.7 KiB
Python

import glob
import os

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus # Decision tree plotting
import seaborn as sns
from IPython.display import display
from scipy.stats import norm
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import export_graphviz # Decision tree from sklearn

from data_treatment import DataAtts
def compare_data(original_data, fake_data, size_of_fake, mode="save"):
    """Print the class balance of the real data and plot real-vs-fake histograms.

    For every column of the real CSV (except ``"Unnamed: 32"``) a normalised
    50-bin histogram of the real values is overlaid with the histogram of the
    same column taken from the last ``size_of_fake`` rows of the fake CSV,
    the latter drawn in red.

    Args:
        original_data: Path of the CSV with the real data; also handed to
            ``DataAtts`` to obtain the class column name, the display names
            of the two classes, a message and the output sub-folder name.
        fake_data: Path of the CSV with the generated data.
        size_of_fake: Number of trailing rows of the fake CSV to compare.
        mode: ``"save"`` writes each figure to
            ``fake_data/<fname>/<column>_distribution.png``; ``"show"``
            displays it. Any other value neither saves nor shows the figure.
    """
    data_atts = DataAtts(original_data)
    real = pd.read_csv(original_data)
    # Use a separate local instead of rebinding the ``fake_data`` path argument.
    fake = pd.read_csv(fake_data).tail(size_of_fake)

    print(data_atts.message, "\n")

    # Count the classes once. ``get`` yields 0 for a class that does not occur
    # in the data instead of raising KeyError.
    counts = real[data_atts.class_name].value_counts()
    total = len(real)
    for label in (0, 1):
        share = (counts.get(label, 0) / total * 100) if total else 0.0
        print(data_atts.values_names[label], round(share, 2), '% of the dataset')

    out_dir = 'fake_data/' + data_atts.fname
    if mode == "save":
        # savefig does not create missing folders, so make sure it exists.
        os.makedirs(out_dir, exist_ok=True)

    for name in real.columns:
        if name == "Unnamed: 32":
            continue

        plt.xlabel('Values')
        plt.ylabel('Probability')
        plt.title(name + " distribution")
        plt.hist(real[name].values, 50, density=True, alpha=0.5)
        plt.hist(fake[name].values, 50, density=True, alpha=0.5, facecolor='r')

        if mode == "save":
            plt.savefig(out_dir + "/" + name + '_distribution.png')
        elif mode == "show":
            plt.show()
        # Reset the figure so the next column starts from an empty canvas.
        plt.clf()
def _class_share(labels, value):
    """Return the percentage of ``labels`` equal to ``value`` as a string.

    The percentage is rounded to two decimals; ``"0"`` is returned when the
    value does not occur in ``labels``.
    """
    try:
        count = labels.value_counts()[value]
    except (KeyError, IndexError):
        return "0"
    return str(round(count / len(labels) * 100, 2))


def create_comparing_table(original_data_name, fake_data_name):
    """Print a Markdown table comparing a tree trained on real vs. fake data.

    Two depth-3 decision trees are trained, one on the first 70 % of the real
    CSV and one on the first 70 % of the fake CSV. Both are evaluated on the
    last 30 % of the real CSV. One table row is printed per training set with
    its class-0 / class-1 percentages and the mean squared difference between
    the predicted and the true test labels.

    Side effects: each fitted tree is exported to ``models/tree.dot``
    (overwriting the previous one).

    Args:
        original_data_name: Path of the CSV with the real data; also handed
            to ``DataAtts`` to obtain the class column name.
        fake_data_name: Path of the CSV with the generated data. The first
            character of its third ``/``-separated component is appended to
            ``"fake"`` to label the row, so the path needs at least three
            components.
    """
    data_atts = DataAtts(original_data_name)
    class_name = data_atts.class_name
    data = pd.read_csv(original_data_name)
    fake_data = pd.read_csv(fake_data_name)

    # Binarise the generated class column at 0.5. Column access by key (rather
    # than getattr) also works for names that clash with DataFrame attributes.
    is_one = fake_data[class_name] >= 0.5
    is_zero = fake_data[class_name] < 0.5
    fake_data.loc[is_one, class_name] = 1
    fake_data.loc[is_zero, class_name] = 0

    # Training sets: first 70 % of each dataset.
    fake_name = "fake" + str(fake_data_name).split("/")[2][0]
    training_sets = [
        ("original", data.head(int(data.shape[0] * 0.7))),
        (fake_name, fake_data.head(int(fake_data.shape[0] * 0.7))),
    ]

    # The test set is always the last 30 % of the real data; split it once.
    test = data.tail(int(data.shape[0] * 0.3))
    test_x = test.drop(columns=[class_name])
    y_test = test[class_name]

    # export_graphviz does not create missing folders.
    os.makedirs("models", exist_ok=True)

    print("| Database \t| Proportion \t| Test Error \t|")
    print("| ---------\t| ---------: \t| :--------- \t|")

    for name, train in training_sets:
        y_train = train[class_name]
        positive = _class_share(y_train, 0)
        negative = _class_share(y_train, 1)
        # ``columns=`` replaces the positional axis argument, which pandas 2.0
        # no longer accepts.
        train_x = train.drop(columns=[class_name])

        clf = DT(max_depth=3, min_samples_leaf=1)
        clf.fit(train_x, y_train)
        export_graphviz(clf, out_file="models/tree.dot", feature_names=list(train_x.columns), class_names=["0", "1"], filled=True, rounded=True)
        # NOTE(review): the parsed graph is not used below; the call is kept
        # so that an unreadable DOT file still fails here as it did before.
        pydotplus.graph_from_dot_file(path="models/tree.dot")

        # predict() returns class labels directly, also when the training set
        # contains a single class, so no argmax / offset handling is needed.
        pred = clf.predict(test_x)

        mse = round(((pred - y_test.values) ** 2).mean(axis=0), 4)
        print("| " + name + " \t| " + positive + "/" + negative + " \t| " + str(mse) + " \t|")