- Link: https://www.kaggle.com/competitions/tabular-playground-series-jul-2022/
- Info: "In this challenge, you are given a dataset where each row belongs to a particular cluster. Your job is to predict the cluster each row belongs to. You are not given any training data, and you are not told how many clusters are found in the ground truth labels."
- Evaluation: "Submissions are evaluated on the Adjusted Rand Index between the ground truth cluster labels of the data and your predicted cluster labels. You are not given the number of ground truth clusters or any training labels. This is a completely unsupervised problem"
azminetoushikwasi provides a great explanation of different clustering techniques and algorithms.
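As a quick illustration of the metric (a minimal sketch with made-up labels, not competition data): `adjusted_rand_score` is permutation-invariant, so the actual cluster IDs in a submission do not matter, only the partition they induce.

from sklearn.metrics import adjusted_rand_score

# The same partition under different cluster IDs still scores a perfect 1.0
truth = [0, 0, 1, 1, 2, 2]
same_partition = [1, 1, 0, 0, 2, 2]
print(adjusted_rand_score(truth, same_partition))        # 1.0
# A disagreeing labeling scores near zero, or below zero for worse-than-chance
print(adjusted_rand_score(truth, [0, 1, 2, 0, 1, 2]))    # negative here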
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
import mlflow
import yellowbrick
from numpy import unique, where
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, adjusted_rand_score, confusion_matrix,
                             silhouette_score, calinski_harabasz_score, davies_bouldin_score)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from sklego.mixture import BayesianGMMClassifier
is_kaggle = False
if not is_kaggle:
    data = pd.read_csv('data.csv')
    sample_submission = pd.read_csv("Submissions/sample_submission.csv")
else:
    data = pd.read_csv("../input/tabular-playground-series-jul-2022/data.csv")
    sample_submission = pd.read_csv("../input/tabular-playground-series-jul-2022/sample_submission.csv")
data.shape
(98000, 30)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98000 entries, 0 to 97999
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      98000 non-null  int64
 1   f_00    98000 non-null  float64
 2   f_01    98000 non-null  float64
 3   f_02    98000 non-null  float64
 4   f_03    98000 non-null  float64
 5   f_04    98000 non-null  float64
 6   f_05    98000 non-null  float64
 7   f_06    98000 non-null  float64
 8   f_07    98000 non-null  int64
 9   f_08    98000 non-null  int64
 10  f_09    98000 non-null  int64
 11  f_10    98000 non-null  int64
 12  f_11    98000 non-null  int64
 13  f_12    98000 non-null  int64
 14  f_13    98000 non-null  int64
 15  f_14    98000 non-null  float64
 16  f_15    98000 non-null  float64
 17  f_16    98000 non-null  float64
 18  f_17    98000 non-null  float64
 19  f_18    98000 non-null  float64
 20  f_19    98000 non-null  float64
 21  f_20    98000 non-null  float64
 22  f_21    98000 non-null  float64
 23  f_22    98000 non-null  float64
 24  f_23    98000 non-null  float64
 25  f_24    98000 non-null  float64
 26  f_25    98000 non-null  float64
 27  f_26    98000 non-null  float64
 28  f_27    98000 non-null  float64
 29  f_28    98000 non-null  float64
dtypes: float64(22), int64(8)
memory usage: 22.4 MB
data.describe()
 | id | f_00 | f_01 | f_02 | f_03 | f_04 | f_05 | f_06 | f_07 | f_08 | ... | f_19 | f_20 | f_21 | f_22 | f_23 | f_24 | f_25 | f_26 | f_27 | f_28
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | ... | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 |
mean | 48999.500000 | 0.001220 | 0.005580 | -0.001042 | -0.000700 | -0.003522 | -0.001612 | -0.003042 | 5.545918 | 6.763061 | ... | -0.004513 | -0.000515 | -0.001670 | -0.038752 | -0.220002 | 0.166434 | -0.064309 | -0.062540 | 0.098472 | -0.230910 |
std | 28290.307527 | 1.002801 | 1.000742 | 1.001373 | 1.000422 | 1.003061 | 1.000532 | 0.997434 | 3.691840 | 4.152348 | ... | 1.004372 | 1.002962 | 0.999703 | 1.477858 | 1.494836 | 1.543014 | 1.576086 | 1.428055 | 1.305407 | 1.528476 |
min | 0.000000 | -4.732235 | -4.202795 | -4.377021 | -4.010826 | -4.535903 | -4.300767 | -4.894525 | 0.000000 | 0.000000 | ... | -4.894525 | -4.732235 | -4.438130 | -6.873999 | -8.234305 | -7.792363 | -6.593842 | -7.375719 | -7.335556 | -6.954151 |
25% | 24499.750000 | -0.675226 | -0.670985 | -0.672779 | -0.672540 | -0.682510 | -0.675066 | -0.680421 | 3.000000 | 4.000000 | ... | -0.678773 | -0.679777 | -0.675147 | -1.022964 | -1.203204 | -0.903385 | -1.128966 | -0.975680 | -0.746489 | -1.262606 |
50% | 48999.500000 | 0.002022 | 0.006650 | -0.000324 | -0.003185 | -0.003307 | 0.001024 | -0.002053 | 5.000000 | 6.000000 | ... | -0.000587 | -0.000806 | 0.000819 | -0.056687 | -0.219046 | 0.167074 | -0.099221 | -0.070852 | 0.082230 | -0.271319 |
75% | 73499.250000 | 0.677271 | 0.677746 | 0.677086 | 0.672097 | 0.677589 | 0.673344 | 0.668112 | 8.000000 | 9.000000 | ... | 0.672149 | 0.675437 | 0.676881 | 0.930158 | 0.764690 | 1.217432 | 0.987684 | 0.843212 | 0.925306 | 0.770516 |
max | 97999.000000 | 4.490521 | 4.324974 | 4.560247 | 4.399373 | 4.050549 | 4.710316 | 3.998595 | 32.000000 | 30.000000 | ... | 4.560247 | 4.399373 | 4.135419 | 6.517721 | 6.054831 | 7.527271 | 7.544731 | 7.005608 | 7.205971 | 6.977150 |
8 rows × 30 columns
na_counter = data.isna().sum().sum()
null_counter = data.isnull().sum().sum()
print(f'NA: {na_counter} / NULL: {null_counter}')
NA: 0 / NULL: 0
sns.set(rc={'figure.figsize': (20, 24)})
sns.heatmap(data.corr(), annot=True, fmt='.2f')
data.hist(bins = 50, figsize=(25, 25))
plt.show()
data = data.drop(['id'], axis=1)
data.shape
(98000, 29)
Yellowbrick's KElbowVisualizer wraps the elbow method: it fits the model over a range of k and scores each fit by distortion (the default), silhouette, or Calinski-Harabasz:
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
def show_k_elbow_visualizer(data):
    # Fit KMeans for k in [4, 13) and mark the elbow on the distortion curve
    elbow_visualizer = KElbowVisualizer(KMeans(), k=(4, 13))
    elbow_visualizer.fit(data)
    elbow_visualizer.show()
    return elbow_visualizer.estimator, elbow_visualizer.elbow_value_

def show_silhouette_visualizer(estimator, data):
    # Plot per-sample silhouette coefficients for the fitted clustering
    visualizer = SilhouetteVisualizer(estimator)
    visualizer.fit(data)
    visualizer.show()
estimator, cluster_count = show_k_elbow_visualizer(data)
Some clusters overlap with others, which adds noise and makes them harder to separate cleanly.
from yellowbrick.cluster import InterclusterDistance
def show_interclusterdistance(data):
    # Instantiate the clustering model and visualizer
    model = KMeans(cluster_count)
    visualizer = InterclusterDistance(model)
    visualizer.fit(data)   # Fit the data to the visualizer
    visualizer.show()      # Finalize and render the figure

show_interclusterdistance(data)
# Select two column sections: section 1 covers f_07-f_13, section 2 covers f_21-f_28
cluster_section_1_column_list = [f'f_{i:02d}' for i in range(7,14)]
cluster_section_2_column_list = [f'f_{i:02d}' for i in range(21,29)]
cluster_column_list = cluster_section_1_column_list + cluster_section_2_column_list
cluster_data = data[cluster_column_list]
scaled_cluster_data = PowerTransformer().fit_transform(cluster_data)
training_data = pd.DataFrame(scaled_cluster_data,columns=cluster_column_list)
#training_data = training_data.astype(np.float64)
training_data.hist(bins = 50, figsize=(25, 25))
plt.show()
#show_k_elbow_visualizer(training_data)
#show_interclusterdistance(training_data)
# define the model
bgm_model = BayesianGaussianMixture(n_components=cluster_count, covariance_type='full', random_state=1)
# assign each data point to a cluster
bgm_model_predictions = pd.DataFrame(bgm_model.fit_predict(training_data))
bgm_model_proba = pd.DataFrame(bgm_model.predict_proba(training_data))
# get all of the unique clusters
bgm_model_labels = unique(bgm_model_predictions)
print(bgm_model_labels)
[0 1 2 3 4 5 6]
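Even though n_components was set from the elbow estimate, BayesianGaussianMixture can drive the weights of surplus components toward zero, so it is worth checking how many components actually carry mass. A small sketch using the `bgm_model` fitted above:

# Mixture weights: components with near-zero weight are effectively unused
print(np.round(bgm_model.weights_, 3))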
#show_silhouette_visualizer(KMeans(), training_data)
def best_class(df):
    """Augment a probability DataFrame with best/second-best class columns."""
    new_df = df.copy()
    new_df["highest_prob"] = df.max(axis=1)
    new_df["best_class"] = df.idxmax(axis=1)
    new_df["second_highest_prob"] = df.apply(lambda x: x.nlargest(2).values[-1], axis=1)
    # Positional index + 1 of the second-largest probability
    # (note: this is position-based and offset by one relative to best_class,
    # which uses the column label directly)
    new_df["second_best_class"] = df.apply(lambda x: np.where(x == x.nlargest(2).values[-1])[0][0] + 1, axis=1)
    #new_df["class_differ"] = new_df["highest_prob"] - new_df["second_highest_prob"]
    return new_df
bgm_model_proba_df = best_class(bgm_model_proba)
bgm_model_proba_df.sample(5)
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | highest_prob | best_class | second_highest_prob | second_best_class
---|---|---|---|---|---|---|---|---|---|---|---
19083 | 0.000649 | 9.953711e-01 | 3.162222e-07 | 1.158325e-13 | 3.976745e-03 | 0.000003 | 1.980883e-08 | 0.995371 | 1 | 0.003977 | 5 |
39374 | 0.000009 | 1.346527e-03 | 2.084187e-05 | 3.001399e-05 | 1.033426e-01 | 0.894909 | 3.424589e-04 | 0.894909 | 5 | 0.103343 | 5 |
55642 | 0.000017 | 3.616600e-10 | 1.089633e-06 | 9.989492e-01 | 1.930447e-10 | 0.001029 | 3.145058e-06 | 0.998949 | 3 | 0.001029 | 6 |
40472 | 0.390173 | 3.604358e-03 | 1.072369e-05 | 2.900247e-05 | 1.562962e-01 | 0.025462 | 4.244249e-01 | 0.424425 | 6 | 0.390173 | 1 |
24163 | 0.147019 | 1.121572e-02 | 3.818412e-01 | 1.876826e-06 | 2.264757e-01 | 0.000484 | 2.329631e-01 | 0.381841 | 2 | 0.232963 | 7 |
training_data_x = pd.concat([training_data,bgm_model_proba_df[['best_class','highest_prob']]], axis=1)
training_data_x.head()
 | f_07 | f_08 | f_09 | f_10 | f_11 | f_12 | f_13 | f_21 | f_22 | f_23 | f_24 | f_25 | f_26 | f_27 | f_28 | best_class | highest_prob
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | -0.977987 | 1.383372 | 1.039938 | -0.567775 | 1.142180 | -0.083596 | 0.086073 | -0.763110 | -0.707876 | 0.911477 | -0.678852 | 0.768543 | 0.960344 | 1.042536 | 0.694234 | 1 | 0.977823 |
1 | -0.977987 | -0.875405 | -0.179925 | -0.837020 | -0.420725 | 1.725834 | 0.738456 | -1.777545 | -0.535582 | 0.453824 | 1.031505 | -0.117686 | -0.550783 | 0.367242 | -1.636652 | 6 | 0.938950 |
2 | 0.021718 | 1.017648 | -0.394246 | 0.124844 | 0.296135 | -0.928325 | 1.118063 | 1.194423 | 2.203065 | 0.086974 | -1.519163 | -0.568662 | 0.978900 | -0.926277 | -2.296373 | 1 | 0.822492 |
3 | 0.286548 | -1.213526 | 0.917564 | 0.124844 | 0.296135 | -0.083596 | -0.434221 | 0.532884 | 0.731623 | -1.218086 | 0.826492 | -1.173592 | -0.395085 | -0.100021 | 0.326682 | 3 | 0.854428 |
4 | 0.756900 | 0.187543 | -0.394246 | -1.135381 | -1.954502 | 1.271662 | 1.118063 | 0.321899 | 0.228337 | -1.482684 | 0.847999 | -0.613935 | 1.164389 | -0.374203 | -1.160058 | 4 | 0.979577 |
confident_predictions = training_data_x.loc[training_data_x["highest_prob"] >= 0.85]  # Training
non_confident_predictions = training_data_x.loc[training_data_x["highest_prob"] < 0.85]  # Predict
print(f'confident: {len(confident_predictions)}')
print(f'non-confident: {len(non_confident_predictions)}')
print(f'total = {len(confident_predictions) + len(non_confident_predictions)}')
confident: 56455
non-confident: 41545
total = 98000
Since the next step trains supervised classifiers on these pseudo-labels, the classes need to be balanced for training.
from yellowbrick.target import class_balance
def show_cluster_class_balance(results):
    sns.set(rc={'figure.figsize': (5, 5)})
    visualizer = class_balance(results)

show_cluster_class_balance(confident_predictions['best_class'])
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
balanced_confident_predictions_Y = confident_predictions['best_class']
balanced_confident_predictions_X = confident_predictions.copy().drop(['best_class', 'highest_prob'], axis=1)
balanced_confident_predictions_X, balanced_confident_predictions_Y = oversample.fit_resample(balanced_confident_predictions_X, balanced_confident_predictions_Y)
print(f'before: {len(confident_predictions)} - after: {len(balanced_confident_predictions_X)}')
before: 56455 - after: 71547
show_cluster_class_balance(balanced_confident_predictions_Y)
model_et = ExtraTreesClassifier(n_estimators=2000,
                                n_jobs=-1,
                                random_state=42)
model_lgbm = LGBMClassifier(objective='multiclass',
                            n_estimators=5000,
                            random_state=42,
                            learning_rate=0.1,
                            n_jobs=-1)
model_qda = QuadraticDiscriminantAnalysis()
model_lda = LinearDiscriminantAnalysis()
model_bgmm = BayesianGMMClassifier(n_components=cluster_count,
                                   random_state=42,
                                   tol=1e-3,
                                   covariance_type='full',
                                   max_iter=400,
                                   n_init=4,
                                   init_params='kmeans')
models = {"ET":model_et, "LGBM":model_lgbm, "QDA":model_qda, "LDA":model_lda, "BGMM_C":model_bgmm}
def evaluate_models(training_data, label_data, verbose=False):
    for model_name, model in models.items():
        print("===", model_name, "===")
        feature_imp, y_pred_list, y_true_list, acc_list, X_val, y_val = k_fold_cv(model=model, X=training_data, y=label_data, verbose=verbose)
        acc_score = accuracy_score(y_pred_list, y_true_list)
        print("{0:0.4f}".format(acc_score))

def k_fold_cv(model, X, y, verbose=True):
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    feature_imp, y_pred_list, y_true_list, acc_list = [], [], [], []
    for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
        if verbose: print("==fold==", fold)
        X_train = X.loc[train_index]
        X_val = X.loc[val_index]
        y_train = y.loc[train_index]
        y_val = y.loc[val_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        y_pred_list = np.append(y_pred_list, y_pred)
        y_true_list = np.append(y_true_list, y_val)
        acc_list.append(accuracy_score(y_pred, y_val))
        if verbose: print('Acc', accuracy_score(y_pred, y_val))
        try:
            feature_imp.append(model.feature_importances_)
        except AttributeError as e:  # model has no .feature_importances_ attribute
            print(e)
    return feature_imp, y_pred_list, y_true_list, acc_list, X_val, y_val
evaluate_models(balanced_confident_predictions_X,balanced_confident_predictions_Y)
=== ET ===
0.9925
=== LGBM ===
0.9980
=== QDA ===
'QuadraticDiscriminantAnalysis' object has no attribute 'feature_importances_' (printed once per fold, 5x)
0.9999
=== LDA ===
'LinearDiscriminantAnalysis' object has no attribute 'feature_importances_' (5x)
0.8547
=== BGMM_C ===
'BayesianGMMClassifier' object has no attribute 'feature_importances_' (5x)
0.9920
non_confident_predictions_Y = pd.DataFrame(non_confident_predictions['best_class'], columns=['best_class']).reset_index(drop=True)
non_confident_predictions_X = non_confident_predictions.copy().drop(['best_class', 'highest_prob'], axis=1)
model_non_confident_predictions = pd.DataFrame(models['LGBM'].predict(non_confident_predictions_X), columns=['best_class'])
model_non_confident_predictions_probas = pd.DataFrame(models['LGBM'].predict_proba(non_confident_predictions_X))
differ_df = model_non_confident_predictions.compare(non_confident_predictions_Y)
print(f'{len(differ_df)} of {len(non_confident_predictions_Y)}')
6966 of 41545
differ_df.head()
 | best_class (self) | best_class (other)
---|---|---
14 | 1.0 | 2.0
15 | 6.0 | 1.0
17 | 4.0 | 0.0
24 | 5.0 | 4.0
25 | 2.0 | 6.0
model_non_confident_predictions_probas = best_class(model_non_confident_predictions_probas)
model_non_confident_predictions_probas.iloc[24]
0                      4.885802e-14
1                      4.400439e-13
2                      2.749495e-12
3                      2.810474e-12
4                      1.148052e-01
5                      8.851948e-01
6                      1.227265e-11
highest_prob           8.851948e-01
best_class             5.000000e+00
second_highest_prob    1.148052e-01
second_best_class      5.000000e+00
Name: 24, dtype: float64
def score_clusters(X, predictions, silhouette=True, verbose=False):
    """Evaluate how good our cluster label predictions are."""
    # Davies-Bouldin: lower is better; Calinski-Harabasz and silhouette: higher is better
    db_score = davies_bouldin_score(X=X, labels=predictions)
    ch_score = calinski_harabasz_score(X=X, labels=predictions)
    # The silhouette score is the slowest to compute (~90 s)
    s_score = silhouette_score(X=X, labels=predictions, metric='euclidean')
    if verbose:
        print("Davies-Bouldin score: {0:0.4f}".format(db_score))
        print("Calinski-Harabasz score: {0:0.3f}".format(ch_score))
        print("Silhouette score: {0:0.4f}".format(s_score))
    return db_score, ch_score, s_score
def fit_predict_all(models, x):
    predictions = []
    predictions_proba = []
    model_names = []
    scores = []
    for model_name, model in models.items():
        print("===", model_name, "===")
        #model.set_params(warm_start=True)
        #model.fit(x, y)
        preds = model.predict(x)
        preds_prob = model.predict_proba(x)
        preds_prob_df = pd.DataFrame(preds_prob, columns=range(1, 8))  #, index=X_scaled.index)
        db, ch, s = score_clusters(x, preds_prob_df.idxmax(axis=1), verbose=True)
        scores.append((db, ch, s))
        predictions.append(preds)
        predictions_proba.append(best_class(preds_prob_df))
        model_names.append(model_name)
    return predictions, predictions_proba, model_names, scores
Y = pd.DataFrame(training_data_x['best_class'], columns=['best_class'])
X = training_data_x.copy().drop(['best_class', 'highest_prob'], axis=1)
predictions, predictions_proba, model_names, scores = fit_predict_all(models, X)
=== ET ===
Davies-Bouldin score: 3.5427
Calinski-Harabasz score: 3713.525
Silhouette score: 0.0382
=== LGBM ===
Davies-Bouldin score: 3.6965
Calinski-Harabasz score: 3586.804
Silhouette score: 0.0347
=== QDA ===
Davies-Bouldin score: 3.6544
Calinski-Harabasz score: 3607.327
Silhouette score: 0.0346
=== LDA ===
Davies-Bouldin score: 3.1453
Calinski-Harabasz score: 4338.890
Silhouette score: 0.0532
=== BGMM_C ===
Davies-Bouldin score: 3.7492
Calinski-Harabasz score: 3518.418
Silhouette score: 0.0323
for model_index in range(0, 5):
    model_predictions = pd.DataFrame(predictions[model_index], columns=['best_class'])
    model_probas = predictions_proba[model_index]
    differ_df = model_predictions.compare(Y)
    model_confident_predictions = model_probas.loc[model_probas["highest_prob"] >= 0.85]
    # strict < so the 0.85 boundary is not counted in both groups
    model_non_confident_predictions = model_probas.loc[model_probas["highest_prob"] < 0.85]
    print(f'{model_names[model_index]}: {len(differ_df)} Prediction differences of {len(training_data_x)} (Confident: {len(model_confident_predictions)}/{len(model_non_confident_predictions)} - {len(model_confident_predictions) / len(training_data_x):.2%})')
ET: 10223 Prediction differences of 98000 (Confident: 50626/47389 - 51.66%)
LGBM: 6997 Prediction differences of 98000 (Confident: 91298/6702 - 93.16%)
QDA: 4683 Prediction differences of 98000 (Confident: 73841/24159 - 75.35%)
LDA: 25527 Prediction differences of 98000 (Confident: 42088/55912 - 42.95%)
BGMM_C: 10333 Prediction differences of 98000 (Confident: 83800/14200 - 85.51%)
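Since ARI is the competition metric, it can also be used to quantify how much the five models' label assignments agree with one another. A minimal sketch, assuming the `predictions` and `model_names` lists returned by `fit_predict_all` above (ARI is permutation-invariant, so differing cluster IDs across models are fine):

# Pairwise agreement between model label assignments, measured with ARI
for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):
        ari = adjusted_rand_score(predictions[i], predictions[j])
        print(f'{model_names[i]} vs {model_names[j]}: ARI = {ari:.4f}')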
'''
predictions, predictions_proba, model_names, scores = fit_predict_all(models, non_confident_predictions_X)
for model_index in range(0, 5):
    model_predictions = pd.DataFrame(predictions[model_index], columns=['best_class'])
    model_probas = predictions_proba[model_index]
    differ_df = model_predictions.compare(training_data_x['best_class'])
    model_confident_predictions = model_probas.loc[model_probas["highest_prob"] >= 0.85]
    model_non_confident_predictions = model_probas.loc[model_probas["highest_prob"] < 0.85]
    print(f'{model_names[model_index]}: {len(differ_df)} Prediction differences of {len(training_data_x)} (Confident: {len(model_confident_predictions)}/{len(model_non_confident_predictions)} - {len(training_data_x) / len(non_confident_predictions_Y):.2%})')
'''
predictions[1]
#models
array([1, 6, 1, ..., 5, 0, 2], dtype=int64)
def update_predictions(predict_number, y):
    # Self-training loop: refit a BayesianGMMClassifier on a 50k sample using the
    # previous iteration's labels, then relabel the full dataset with its predictions
    for i in range(predict_number):
        print("=========", i, "==========")
        X_scaled_sample = X.sample(50000)
        y_sample = y.loc[X_scaled_sample.index]
        bgmmC = BayesianGMMClassifier(n_components=7,
                                      random_state=i,
                                      tol=1e-3,
                                      covariance_type='full',
                                      max_iter=300,
                                      n_init=3,
                                      init_params='kmeans')
        # Note: fitting with a one-column DataFrame raises a DataConversionWarning;
        # passing y_sample.values.ravel() would silence it
        bgmmC.fit(X_scaled_sample, y_sample)
        pred_probs = bgmmC.predict_proba(X)
        pred_probs = pd.DataFrame(pred_probs, columns=range(1, 8))
        # Score the cluster labels at each iteration
        score_clusters(X, pred_probs.idxmax(axis=1), verbose=True)
        # Feed this iteration's hard labels into the next round
        y = pred_probs.idxmax(axis=1)
    return pred_probs
predicted_probabilities = update_predictions(predict_number=20, y=pd.DataFrame(predictions[1]))
sklearn DataConversionWarning (validation.py): A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
sklearn ConvergenceWarning (during iteration 17): Initialization 3 did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.

iteration | Davies-Bouldin score | Calinski-Harabasz score | Silhouette score
---|---|---|---
0 | 3.7653 | 3501.557 | 0.0320
1 | 3.8379 | 3413.155 | 0.0299
2 | 3.9001 | 3358.794 | 0.0288
3 | 3.9448 | 3323.060 | 0.0282
4 | 3.9776 | 3299.044 | 0.0278
5 | 3.9979 | 3285.136 | 0.0275
6 | 4.0156 | 3274.330 | 0.0273
7 | 4.0292 | 3267.195 | 0.0272
8 | 4.0384 | 3263.020 | 0.0271
9 | 4.0453 | 3258.194 | 0.0271
10 | 4.0425 | 3256.776 | 0.0270
11 | 4.0545 | 3253.768 | 0.0269
12 | 4.0526 | 3256.191 | 0.0271
13 | 4.0536 | 3254.550 | 0.0270
14 | 4.0526 | 3250.661 | 0.0269
15 | 4.0593 | 3250.454 | 0.0269
16 | 4.0491 | 3256.367 | 0.0270
17 | 4.0518 | 3256.396 | 0.0270
18 | 4.0504 | 3257.948 | 0.0271
19 | 4.0482 | 3258.636 | 0.0271
for model_index in range(0, 5):
    sample_submission["Predicted"] = predictions[model_index]
    sample_submission.to_csv(f'Submissions/{model_names[model_index]}_submission.csv', index=False)
predicted_probabilities = best_class(predicted_probabilities)
sample_submission["Predicted"] = predicted_probabilities['best_class']
sample_submission.to_csv('Submissions/improved_submission.csv', index=False)
predicted_probabilities['best_class']
0        2
1        7
2        6
3        4
4        5
        ..
97995    7
97996    3
97997    6
97998    1
97999    3
Name: best_class, Length: 98000, dtype: int64
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)
df = pd.DataFrame({"x": reduced_data[:, 0], "y": reduced_data[:, 1], "clusters": predicted_probabilities['best_class']})
plt.figure(figsize=(20, 10))
sns.scatterplot(x=df["x"], y=df["y"], hue=df["clusters"])