import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import norm
import ipywidgets as widgets
import matplotlib.pyplot as plt
import glob

from data_treatment import DataAtts
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import export_graphviz  # decision tree export from sklearn
import pydotplus  # decision tree plotting

def compare_data(original_data, fake_data, size_of_fake, mode="save"):
    """Overlay the per-column distributions of the original dataset and the
    generated (fake) dataset, saving or showing one histogram per column."""
    dataAtts = DataAtts(original_data)

    data = pd.read_csv(original_data)
    fake_data = pd.read_csv(fake_data).tail(size_of_fake)

    print(dataAtts.message, "\n")
    print(dataAtts.values_names[0], round(data[dataAtts.class_name].value_counts()[0]/len(data) * 100, 2), '% of the dataset')
    print(dataAtts.values_names[1], round(data[dataAtts.class_name].value_counts()[1]/len(data) * 100, 2), '% of the dataset')

    classes = list(data)

    for name in classes:
        # Skip the empty trailing column that some CSV exports include
        if name == "Unnamed: 32":
            continue

        plt.xlabel('Values')
        plt.ylabel('Probability')
        plt.title(name + " distribution")
        real_dist = data[name].values
        fake_dist = fake_data[name].values
        # Normalised histograms: real data in the default colour, fake data in red
        plt.hist(real_dist, bins=50, density=True, alpha=0.5)
        plt.hist(fake_dist, bins=50, density=True, alpha=0.5, facecolor='r')
        if mode == "save":
            plt.savefig('fake_data/' + dataAtts.fname + "/" + name + '_distribution.png')
        elif mode == "show":
            plt.show()
        plt.clf()

def create_comparing_table(original_data_name, fake_data_name):
    """Train one decision tree on the original data and one on the generated
    data, then print a Markdown table with class proportions and test error."""
    dataAtts = DataAtts(original_data_name)
    data = pd.read_csv(original_data_name)
    fake_data = pd.read_csv(fake_data_name)
    # Binarise the generated class column with a 0.5 threshold
    fake_data.loc[getattr(fake_data, dataAtts.class_name) >= 0.5, dataAtts.class_name] = 1
    fake_data.loc[getattr(fake_data, dataAtts.class_name) < 0.5, dataAtts.class_name] = 0

    # Creates the training sets: the first 70% of the original and of the fake data
    training_data = [["original", data.head(int(data.shape[0]*0.7))]]
    # Label derived from the first character of the third path component, e.g. "fake_data/<model>/<file>.csv"
    fake_name = "fake" + str(fake_data_name).split("/")[2][0]
    training_data.append([fake_name, fake_data.head(int(fake_data.shape[0]*0.7))])

    # The last 30% of the original data is the common test set for both trees
    test = data.tail(int(data.shape[0]*0.3))

    print("| Database \t| Proportion \t| Test Error \t|")
    print("| ---------\t| ---------: \t| :--------- \t|")

    for name, train in training_data:
        # Percentage of each class in the training set; "0" if the class is absent
        try:
            positive = str(round(train[dataAtts.class_name].value_counts()[0]/len(train) * 100, 2))
        except KeyError:
            positive = "0"
        try:
            negative = str(round(train[dataAtts.class_name].value_counts()[1]/len(train) * 100, 2))
        except KeyError:
            negative = "0"

        trainX = train.drop(dataAtts.class_name, axis=1)
        testX = test.drop(dataAtts.class_name, axis=1)
        y_train = train[dataAtts.class_name]
        y_test = test[dataAtts.class_name]
        # trainX = pd.get_dummies(trainX)

        clf1 = DT(max_depth=3, min_samples_leaf=1)
        clf1 = clf1.fit(trainX, y_train)
        export_graphviz(clf1, out_file="models/tree.dot", feature_names=trainX.columns, class_names=["0", "1"], filled=True, rounded=True)
        g = pydotplus.graph_from_dot_file(path="models/tree.dot")

        pred = clf1.predict_proba(testX)
        if pred.shape[1] > 1:
            # Both classes were seen during training: predict the most probable one
            pred = np.argmax(pred, axis=1)
        else:
            # Only one class was seen during training: predict_proba has a single column
            pred = pred.reshape((pred.shape[0]))
        if negative == "0":
            # Training data contained only class 0, so map the single-column probability (1.0) to the label 0
            pred = pred - 1

        # Mean squared error on 0/1 labels, i.e. the misclassification rate
        mse = round(((pred - y_test.values)**2).mean(axis=0), 4)

        string = "| " + name + " \t| " + positive + "/" + negative + " \t| " + str(mse) + " \t|"
        print(string)
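

# --- Example usage (illustrative sketch, not part of the original pipeline) ---
# The paths below are hypothetical placeholders: create_comparing_table expects
# the fake-data path to have at least three components ("fake_data/<model>/<file>.csv"),
# and compare_data saves its figures into 'fake_data/<dataAtts.fname>/'.
if __name__ == "__main__":
    original_csv = "data/original_dataset.csv"                  # hypothetical original dataset
    generated_csv = "fake_data/model_1/generated_samples.csv"   # hypothetical generated samples

    # Overlay real vs. generated distributions for every column and save the plots
    compare_data(original_csv, generated_csv, size_of_fake=500, mode="save")

    # Print a Markdown table comparing trees trained on real vs. generated data
    create_comparing_table(original_csv, generated_csv)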