- Link: https://www.kaggle.com/competitions/tabular-playground-series-jul-2022/
- Info: "In this challenge, you are given a dataset where each row belongs to a particular cluster. Your job is to predict the cluster each row belongs to. You are not given any training data, and you are not told how many clusters are found in the ground truth labels."
- Evaluation: "Submissions are evaluated on the Adjusted Rand Index between the ground truth cluster labels of the data and your predicted cluster labels. You are not given the number of ground truth clusters or any training labels. This is a completely unsupervised problem"
azminetoushikwasi provides a great explanation of different clustering techniques and algorithms.
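As a quick illustration of the metric (a minimal sketch with made-up labels, not competition data): `adjusted_rand_score` is permutation-invariant, so the actual cluster IDs in a submission do not matter, only the partition they induce.

from sklearn.metrics import adjusted_rand_score

# The same partition under different cluster IDs still scores a perfect 1.0
truth = [0, 0, 1, 1, 2, 2]
same_partition = [1, 1, 0, 0, 2, 2]
print(adjusted_rand_score(truth, same_partition))        # 1.0
# A disagreeing labeling scores near zero, or below zero for worse-than-chance
print(adjusted_rand_score(truth, [0, 1, 2, 0, 1, 2]))    # negative here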
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
import mlflow
import yellowbrick
from numpy import unique, where
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, adjusted_rand_score, confusion_matrix,
                             silhouette_score, calinski_harabasz_score, davies_bouldin_score)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from sklego.mixture import BayesianGMMClassifier
is_kaggle = False
if not is_kaggle:
    data = pd.read_csv('data.csv')
    sample_submission = pd.read_csv("Submissions/sample_submission.csv")
else:
    data = pd.read_csv("../input/tabular-playground-series-jul-2022/data.csv")
    sample_submission = pd.read_csv("../input/tabular-playground-series-jul-2022/sample_submission.csv")
data.shape
(98000, 30)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98000 entries, 0 to 97999
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      98000 non-null  int64
 1   f_00    98000 non-null  float64
 2   f_01    98000 non-null  float64
 3   f_02    98000 non-null  float64
 4   f_03    98000 non-null  float64
 5   f_04    98000 non-null  float64
 6   f_05    98000 non-null  float64
 7   f_06    98000 non-null  float64
 8   f_07    98000 non-null  int64
 9   f_08    98000 non-null  int64
 10  f_09    98000 non-null  int64
 11  f_10    98000 non-null  int64
 12  f_11    98000 non-null  int64
 13  f_12    98000 non-null  int64
 14  f_13    98000 non-null  int64
 15  f_14    98000 non-null  float64
 16  f_15    98000 non-null  float64
 17  f_16    98000 non-null  float64
 18  f_17    98000 non-null  float64
 19  f_18    98000 non-null  float64
 20  f_19    98000 non-null  float64
 21  f_20    98000 non-null  float64
 22  f_21    98000 non-null  float64
 23  f_22    98000 non-null  float64
 24  f_23    98000 non-null  float64
 25  f_24    98000 non-null  float64
 26  f_25    98000 non-null  float64
 27  f_26    98000 non-null  float64
 28  f_27    98000 non-null  float64
 29  f_28    98000 non-null  float64
dtypes: float64(22), int64(8)
memory usage: 22.4 MB
data.describe()
 | id | f_00 | f_01 | f_02 | f_03 | f_04 | f_05 | f_06 | f_07 | f_08 | ... | f_19 | f_20 | f_21 | f_22 | f_23 | f_24 | f_25 | f_26 | f_27 | f_28
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | ... | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 | 98000.000000 |
mean | 48999.500000 | 0.001220 | 0.005580 | -0.001042 | -0.000700 | -0.003522 | -0.001612 | -0.003042 | 5.545918 | 6.763061 | ... | -0.004513 | -0.000515 | -0.001670 | -0.038752 | -0.220002 | 0.166434 | -0.064309 | -0.062540 | 0.098472 | -0.230910 |
std | 28290.307527 | 1.002801 | 1.000742 | 1.001373 | 1.000422 | 1.003061 | 1.000532 | 0.997434 | 3.691840 | 4.152348 | ... | 1.004372 | 1.002962 | 0.999703 | 1.477858 | 1.494836 | 1.543014 | 1.576086 | 1.428055 | 1.305407 | 1.528476 |
min | 0.000000 | -4.732235 | -4.202795 | -4.377021 | -4.010826 | -4.535903 | -4.300767 | -4.894525 | 0.000000 | 0.000000 | ... | -4.894525 | -4.732235 | -4.438130 | -6.873999 | -8.234305 | -7.792363 | -6.593842 | -7.375719 | -7.335556 | -6.954151 |
25% | 24499.750000 | -0.675226 | -0.670985 | -0.672779 | -0.672540 | -0.682510 | -0.675066 | -0.680421 | 3.000000 | 4.000000 | ... | -0.678773 | -0.679777 | -0.675147 | -1.022964 | -1.203204 | -0.903385 | -1.128966 | -0.975680 | -0.746489 | -1.262606 |
50% | 48999.500000 | 0.002022 | 0.006650 | -0.000324 | -0.003185 | -0.003307 | 0.001024 | -0.002053 | 5.000000 | 6.000000 | ... | -0.000587 | -0.000806 | 0.000819 | -0.056687 | -0.219046 | 0.167074 | -0.099221 | -0.070852 | 0.082230 | -0.271319 |
75% | 73499.250000 | 0.677271 | 0.677746 | 0.677086 | 0.672097 | 0.677589 | 0.673344 | 0.668112 | 8.000000 | 9.000000 | ... | 0.672149 | 0.675437 | 0.676881 | 0.930158 | 0.764690 | 1.217432 | 0.987684 | 0.843212 | 0.925306 | 0.770516 |
max | 97999.000000 | 4.490521 | 4.324974 | 4.560247 | 4.399373 | 4.050549 | 4.710316 | 3.998595 | 32.000000 | 30.000000 | ... | 4.560247 | 4.399373 | 4.135419 | 6.517721 | 6.054831 | 7.527271 | 7.544731 | 7.005608 | 7.205971 | 6.977150 |
8 rows × 30 columns
na_counter = data.isna().sum().sum()
null_counter = data.isnull().sum().sum()
print(f'NA: {na_counter} / NULL: {null_counter}')
NA: 0 / NULL: 0
sns.set(rc={'figure.figsize': (20, 24)})
sns.heatmap(data.corr(), annot=True, fmt='.2f')
data.hist(bins = 50, figsize=(25, 25))
plt.show()
data = data.drop(['id'], axis=1)
data.shape
(98000, 29)
Yellowbrick's KElbowVisualizer wraps the elbow method: it fits the model over a range of k and scores each fit by distortion (the default), silhouette, or Calinski-Harabasz:
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
def show_k_elbow_visualizer(data):
    # Fit KMeans for k in [4, 13) and mark the elbow on the distortion curve
    elbow_visualizer = KElbowVisualizer(KMeans(), k=(4, 13))
    elbow_visualizer.fit(data)
    elbow_visualizer.show()
    return elbow_visualizer.estimator, elbow_visualizer.elbow_value_

def show_silhouette_visualizer(estimator, data):
    # Plot per-sample silhouette coefficients for the fitted clustering
    visualizer = SilhouetteVisualizer(estimator)
    visualizer.fit(data)
    visualizer.show()
estimator, cluster_count = show_k_elbow_visualizer(data)
Some clusters overlap with others, which adds noise and makes them harder to separate cleanly.
from yellowbrick.cluster import InterclusterDistance
def show_interclusterdistance(data):
    # Instantiate the clustering model and visualizer
    model = KMeans(cluster_count)
    visualizer = InterclusterDistance(model)
    visualizer.fit(data)   # Fit the data to the visualizer
    visualizer.show()      # Finalize and render the figure

show_interclusterdistance(data)
# Select two column sections: section 1 covers f_07-f_13, section 2 covers f_21-f_28
cluster_section_1_column_list = [f'f_{i:02d}' for i in range(7,14)]
cluster_section_2_column_list = [f'f_{i:02d}' for i in range(21,29)]
cluster_column_list = cluster_section_1_column_list + cluster_section_2_column_list
cluster_data = data[cluster_column_list]
scaled_cluster_data = PowerTransformer().fit_transform(cluster_data)
training_data = pd.DataFrame(scaled_cluster_data,columns=cluster_column_list)
#training_data = training_data.astype(np.float64)
training_data.hist(bins = 50, figsize=(25, 25))
plt.show()
#show_k_elbow_visualizer(training_data)
#show_interclusterdistance(training_data)
# define the model
bgm_model = BayesianGaussianMixture(n_components=cluster_count, covariance_type='full', random_state=1)
# assign each data point to a cluster
bgm_model_predictions = pd.DataFrame(bgm_model.fit_predict(training_data))
bgm_model_proba = pd.DataFrame(bgm_model.predict_proba(training_data))
# get all of the unique clusters
bgm_model_labels = unique(bgm_model_predictions)
print(bgm_model_labels)
[0 1 2 3 4 5 6]
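Even though n_components was set from the elbow estimate, BayesianGaussianMixture can drive the weights of surplus components toward zero, so it is worth checking how many components actually carry mass. A small sketch using the `bgm_model` fitted above:

# Mixture weights: components with near-zero weight are effectively unused
print(np.round(bgm_model.weights_, 3))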
#show_silhouette_visualizer(KMeans(), training_data)
def best_class(df):
    """Augment a probability DataFrame with best/second-best class columns."""
    new_df = df.copy()
    new_df["highest_prob"] = df.max(axis=1)
    new_df["best_class"] = df.idxmax(axis=1)
    new_df["second_highest_prob"] = df.apply(lambda x: x.nlargest(2).values[-1], axis=1)
    # Positional index + 1 of the second-largest probability
    # (note: this is position-based and offset by one relative to best_class,
    # which uses the column label directly)
    new_df["second_best_class"] = df.apply(lambda x: np.where(x == x.nlargest(2).values[-1])[0][0] + 1, axis=1)
    #new_df["class_differ"] = new_df["highest_prob"] - new_df["second_highest_prob"]
    return new_df
bgm_model_proba_df = best_class(bgm_model_proba)
bgm_model_proba_df.sample(5)
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | highest_prob | best_class | second_highest_prob | second_best_class
---|---|---|---|---|---|---|---|---|---|---|---
19083 | 0.000649 | 9.953711e-01 | 3.162222e-07 | 1.158325e-13 | 3.976745e-03 | 0.000003 | 1.980883e-08 | 0.995371 | 1 | 0.003977 | 5 |
39374 | 0.000009 | 1.346527e-03 | 2.084187e-05 | 3.001399e-05 | 1.033426e-01 | 0.894909 | 3.424589e-04 | 0.894909 | 5 | 0.103343 | 5 |
55642 | 0.000017 | 3.616600e-10 | 1.089633e-06 | 9.989492e-01 | 1.930447e-10 | 0.001029 | 3.145058e-06 | 0.998949 | 3 | 0.001029 | 6 |
40472 | 0.390173 | 3.604358e-03 | 1.072369e-05 | 2.900247e-05 | 1.562962e-01 | 0.025462 | 4.244249e-01 | 0.424425 | 6 | 0.390173 | 1 |
24163 | 0.147019 | 1.121572e-02 | 3.818412e-01 | 1.876826e-06 | 2.264757e-01 | 0.000484 | 2.329631e-01 | 0.381841 | 2 | 0.232963 | 7 |
training_data_x = pd.concat([training_data,bgm_model_proba_df[['best_class','highest_prob']]], axis=1)
training_data_x.head()
 | f_07 | f_08 | f_09 | f_10 | f_11 | f_12 | f_13 | f_21 | f_22 | f_23 | f_24 | f_25 | f_26 | f_27 | f_28 | best_class | highest_prob
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | -0.977987 | 1.383372 | 1.039938 | -0.567775 | 1.142180 | -0.083596 | 0.086073 | -0.763110 | -0.707876 | 0.911477 | -0.678852 | 0.768543 | 0.960344 | 1.042536 | 0.694234 | 1 | 0.977823 |
1 | -0.977987 | -0.875405 | -0.179925 | -0.837020 | -0.420725 | 1.725834 | 0.738456 | -1.777545 | -0.535582 | 0.453824 | 1.031505 | -0.117686 | -0.550783 | 0.367242 | -1.636652 | 6 | 0.938950 |
2 | 0.021718 | 1.017648 | -0.394246 | 0.124844 | 0.296135 | -0.928325 | 1.118063 | 1.194423 | 2.203065 | 0.086974 | -1.519163 | -0.568662 | 0.978900 | -0.926277 | -2.296373 | 1 | 0.822492 |
3 | 0.286548 | -1.213526 | 0.917564 | 0.124844 | 0.296135 | -0.083596 | -0.434221 | 0.532884 | 0.731623 | -1.218086 | 0.826492 | -1.173592 | -0.395085 | -0.100021 | 0.326682 | 3 | 0.854428 |
4 | 0.756900 | 0.187543 | -0.394246 | -1.135381 | -1.954502 | 1.271662 | 1.118063 | 0.321899 | 0.228337 | -1.482684 | 0.847999 | -0.613935 | 1.164389 | -0.374203 | -1.160058 | 4 | 0.979577 |
confident_predictions = training_data_x.loc[training_data_x["highest_prob"] >= 0.85]  # Training
non_confident_predictions = training_data_x.loc[training_data_x["highest_prob"] < 0.85]  # Predict
print(f'confident: {len(confident_predictions)}')
print(f'non-confident: {len(non_confident_predictions)}')
print(f'total = {len(confident_predictions) + len(non_confident_predictions)}')
confident: 56455
non-confident: 41545
total = 98000
Since the next step trains supervised classifiers on these pseudo-labels, the classes need to be balanced for training.
from yellowbrick.target import class_balance
def show_cluster_class_balance(results):
    sns.set(rc={'figure.figsize': (5, 5)})
    visualizer = class_balance(results)

show_cluster_class_balance(confident_predictions['best_class'])
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
balanced_confident_predictions_Y = confident_predictions['best_class']
balanced_confident_predictions_X = confident_predictions.copy().drop(['best_class', 'highest_prob'], axis=1)
balanced_confident_predictions_X, balanced_confident_predictions_Y = oversample.fit_resample(balanced_confident_predictions_X, balanced_confident_predictions_Y)
print(f'before: {len(confident_predictions)} - after: {len(balanced_confident_predictions_X)}')
before: 56455 - after: 71547
show_cluster_class_balance(balanced_confident_predictions_Y)
model_et = ExtraTreesClassifier(n_estimators=2000,
                                n_jobs=-1,
                                random_state=42)
model_lgbm = LGBMClassifier(objective='multiclass',
                            n_estimators=5000,
                            random_state=42,
                            learning_rate=0.1,
                            n_jobs=-1)
model_qda = QuadraticDiscriminantAnalysis()
model_lda = LinearDiscriminantAnalysis()
model_bgmm = BayesianGMMClassifier(n_components=cluster_count,
                                   random_state=42,
                                   tol=1e-3,
                                   covariance_type='full',
                                   max_iter=400,
                                   n_init=4,
                                   init_params='kmeans')
models = {"ET":model_et, "LGBM":model_lgbm, "QDA":model_qda, "LDA":model_lda, "BGMM_C":model_bgmm}
def evaluate_models(training_data, label_data, verbose=False):
    for model_name, model in models.items():
        print("===", model_name, "===")
        feature_imp, y_pred_list, y_true_list, acc_list, X_val, y_val = k_fold_cv(model=model, X=training_data, y=label_data, verbose=verbose)
        acc_score = accuracy_score(y_pred_list, y_true_list)
        print("{0:0.4f}".format(acc_score))

def k_fold_cv(model, X, y, verbose=True):
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    feature_imp, y_pred_list, y_true_list, acc_list = [], [], [], []
    for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
        if verbose: print("==fold==", fold)
        X_train = X.loc[train_index]
        X_val = X.loc[val_index]
        y_train = y.loc[train_index]
        y_val = y.loc[val_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        y_pred_list = np.append(y_pred_list, y_pred)
        y_true_list = np.append(y_true_list, y_val)
        acc_list.append(accuracy_score(y_pred, y_val))
        if verbose: print('Acc', accuracy_score(y_pred, y_val))
        try:
            feature_imp.append(model.feature_importances_)
        except AttributeError as e:  # model has no .feature_importances_ attribute
            print(e)
    return feature_imp, y_pred_list, y_true_list, acc_list, X_val, y_val
evaluate_models(balanced_confident_predictions_X,balanced_confident_predictions_Y)
=== ET ===
0.9925
=== LGBM ===
0.9980
=== QDA ===
'QuadraticDiscriminantAnalysis' object has no attribute 'feature_importances_' (printed once per fold, 5x)
0.9999
=== LDA ===
'LinearDiscriminantAnalysis' object has no attribute 'feature_importances_' (5x)
0.8547
=== BGMM_C ===
'BayesianGMMClassifier' object has no attribute 'feature_importances_' (5x)
0.9920
non_confident_predictions_Y = pd.DataFrame(non_confident_predictions['best_class'], columns=['best_class']).reset_index(drop=True)
non_confident_predictions_X = non_confident_predictions.copy().drop(['best_class', 'highest_prob'], axis=1)
model_non_confident_predictions = pd.DataFrame(models['LGBM'].predict(non_confident_predictions_X), columns=['best_class'])
model_non_confident_predictions_probas = pd.DataFrame(models['LGBM'].predict_proba(non_confident_predictions_X))
differ_df = model_non_confident_predictions.compare(non_confident_predictions_Y)
print(f'{len(differ_df)} of {len(non_confident_predictions_Y)}')
6966 of 41545
differ_df.head()
 | best_class (self) | best_class (other)
---|---|---
14 | 1.0 | 2.0
15 | 6.0 | 1.0
17 | 4.0 | 0.0
24 | 5.0 | 4.0
25 | 2.0 | 6.0
model_non_confident_predictions_probas = best_class(model_non_confident_predictions_probas)
model_non_confident_predictions_probas.iloc[24]
0                      4.885802e-14
1                      4.400439e-13
2                      2.749495e-12
3                      2.810474e-12
4                      1.148052e-01
5                      8.851948e-01
6                      1.227265e-11
highest_prob           8.851948e-01
best_class             5.000000e+00
second_highest_prob    1.148052e-01
second_best_class      5.000000e+00
Name: 24, dtype: float64
def score_clusters(X, predictions, silhouette=True, verbose=False):
    """Evaluate how good our cluster label predictions are."""
    # Davies-Bouldin: lower is better; Calinski-Harabasz and silhouette: higher is better
    db_score = davies_bouldin_score(X=X, labels=predictions)
    ch_score = calinski_harabasz_score(X=X, labels=predictions)
    # The silhouette score is the slowest to compute (~90 s)
    s_score = silhouette_score(X=X, labels=predictions, metric='euclidean')
    if verbose:
        print("Davies-Bouldin score: {0:0.4f}".format(db_score))
        print("Calinski-Harabasz score: {0:0.3f}".format(ch_score))
        print("Silhouette score: {0:0.4f}".format(s_score))
    return db_score, ch_score, s_score
def fit_predict_all(models, x):
    predictions = []
    predictions_proba = []
    model_names = []
    scores = []
    for model_name, model in models.items():
        print("===", model_name, "===")
        #model.set_params(warm_start=True)
        #model.fit(x, y)
        preds = model.predict(x)
        preds_prob = model.predict_proba(x)
        preds_prob_df = pd.DataFrame(preds_prob, columns=range(1, 8))  #, index=X_scaled.index)
        db, ch, s = score_clusters(x, preds_prob_df.idxmax(axis=1), verbose=True)
        scores.append((db, ch, s))
        predictions.append(preds)
        predictions_proba.append(best_class(preds_prob_df))
        model_names.append(model_name)
    return predictions, predictions_proba, model_names, scores
Y = pd.DataFrame(training_data_x['best_class'], columns=['best_class'])
X = training_data_x.copy().drop(['best_class', 'highest_prob'], axis=1)
predictions, predictions_proba, model_names, scores = fit_predict_all(models, X)
=== ET ===
Davies-Bouldin score: 3.5427
Calinski-Harabasz score: 3713.525
Silhouette score: 0.0382
=== LGBM ===
Davies-Bouldin score: 3.6965
Calinski-Harabasz score: 3586.804
Silhouette score: 0.0347
=== QDA ===
Davies-Bouldin score: 3.6544
Calinski-Harabasz score: 3607.327
Silhouette score: 0.0346
=== LDA ===
Davies-Bouldin score: 3.1453
Calinski-Harabasz score: 4338.890
Silhouette score: 0.0532
=== BGMM_C ===
Davies-Bouldin score: 3.7492
Calinski-Harabasz score: 3518.418
Silhouette score: 0.0323
for model_index in range(0, 5):
    model_predictions = pd.DataFrame(predictions[model_index], columns=['best_class'])
    model_probas = predictions_proba[model_index]
    differ_df = model_predictions.compare(Y)
    model_confident_predictions = model_probas.loc[model_probas["highest_prob"] >= 0.85]
    # strict < so the 0.85 boundary is not counted in both groups
    model_non_confident_predictions = model_probas.loc[model_probas["highest_prob"] < 0.85]
    print(f'{model_names[model_index]}: {len(differ_df)} Prediction differences of {len(training_data_x)} (Confident: {len(model_confident_predictions)}/{len(model_non_confident_predictions)} - {len(model_confident_predictions) / len(training_data_x):.2%})')
ET: 10223 Prediction differences of 98000 (Confident: 50626/47389 - 51.66%)
LGBM: 6997 Prediction differences of 98000 (Confident: 91298/6702 - 93.16%)
QDA: 4683 Prediction differences of 98000 (Confident: 73841/24159 - 75.35%)
LDA: 25527 Prediction differences of 98000 (Confident: 42088/55912 - 42.95%)
BGMM_C: 10333 Prediction differences of 98000 (Confident: 83800/14200 - 85.51%)
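Since ARI is the competition metric, it can also be used to quantify how much the five models' label assignments agree with one another. A minimal sketch, assuming the `predictions` and `model_names` lists returned by `fit_predict_all` above (ARI is permutation-invariant, so differing cluster IDs across models are fine):

# Pairwise agreement between model label assignments, measured with ARI
for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):
        ari = adjusted_rand_score(predictions[i], predictions[j])
        print(f'{model_names[i]} vs {model_names[j]}: ARI = {ari:.4f}')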
'''
predictions, predictions_proba, model_names, scores = fit_predict_all(models, non_confident_predictions_X)
for model_index in range(0, 5):
    model_predictions = pd.DataFrame(predictions[model_index], columns=['best_class'])
    model_probas = predictions_proba[model_index]
    differ_df = model_predictions.compare(training_data_x['best_class'])
    model_confident_predictions = model_probas.loc[model_probas["highest_prob"] >= 0.85]
    model_non_confident_predictions = model_probas.loc[model_probas["highest_prob"] < 0.85]
    print(f'{model_names[model_index]}: {len(differ_df)} Prediction differences of {len(training_data_x)} (Confident: {len(model_confident_predictions)}/{len(model_non_confident_predictions)} - {len(training_data_x) / len(non_confident_predictions_Y):.2%})')
'''
predictions[1]
#models
array([1, 6, 1, ..., 5, 0, 2], dtype=int64)
def update_predictions(predict_number, y):
    # Self-training loop: refit a BayesianGMMClassifier on a 50k sample using the
    # previous iteration's labels, then relabel the full dataset with its predictions
    for i in range(predict_number):
        print("=========", i, "==========")
        X_scaled_sample = X.sample(50000)
        y_sample = y.loc[X_scaled_sample.index]
        bgmmC = BayesianGMMClassifier(n_components=7,
                                      random_state=i,
                                      tol=1e-3,
                                      covariance_type='full',
                                      max_iter=300,
                                      n_init=3,
                                      init_params='kmeans')
        # Note: fitting with a one-column DataFrame raises a DataConversionWarning;
        # passing y_sample.values.ravel() would silence it
        bgmmC.fit(X_scaled_sample, y_sample)
        pred_probs = bgmmC.predict_proba(X)
        pred_probs = pd.DataFrame(pred_probs, columns=range(1, 8))
        # Score the cluster labels at each iteration
        score_clusters(X, pred_probs.idxmax(axis=1), verbose=True)
        # Feed this iteration's hard labels into the next round
        y = pred_probs.idxmax(axis=1)
    return pred_probs
predicted_probabilities = update_predictions(predict_number=20, y=pd.DataFrame(predictions[1]))
sklearn DataConversionWarning (validation.py): A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
sklearn ConvergenceWarning (during iteration 17): Initialization 3 did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.

iteration | Davies-Bouldin score | Calinski-Harabasz score | Silhouette score
---|---|---|---
0 | 3.7653 | 3501.557 | 0.0320
1 | 3.8379 | 3413.155 | 0.0299
2 | 3.9001 | 3358.794 | 0.0288
3 | 3.9448 | 3323.060 | 0.0282
4 | 3.9776 | 3299.044 | 0.0278
5 | 3.9979 | 3285.136 | 0.0275
6 | 4.0156 | 3274.330 | 0.0273
7 | 4.0292 | 3267.195 | 0.0272
8 | 4.0384 | 3263.020 | 0.0271
9 | 4.0453 | 3258.194 | 0.0271
10 | 4.0425 | 3256.776 | 0.0270
11 | 4.0545 | 3253.768 | 0.0269
12 | 4.0526 | 3256.191 | 0.0271
13 | 4.0536 | 3254.550 | 0.0270
14 | 4.0526 | 3250.661 | 0.0269
15 | 4.0593 | 3250.454 | 0.0269
16 | 4.0491 | 3256.367 | 0.0270
17 | 4.0518 | 3256.396 | 0.0270
18 | 4.0504 | 3257.948 | 0.0271
19 | 4.0482 | 3258.636 | 0.0271
for model_index in range(0, 5):
    sample_submission["Predicted"] = predictions[model_index]
    sample_submission.to_csv(f'Submissions/{model_names[model_index]}_submission.csv', index=False)
predicted_probabilities = best_class(predicted_probabilities)
sample_submission["Predicted"] = predicted_probabilities['best_class']
sample_submission.to_csv('Submissions/improved_submission.csv', index=False)
predicted_probabilities['best_class']
0        2
1        7
2        6
3        4
4        5
        ..
97995    7
97996    3
97997    6
97998    1
97999    3
Name: best_class, Length: 98000, dtype: int64
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)
df = pd.DataFrame({"x": reduced_data[:, 0], "y": reduced_data[:, 1], "clusters": predicted_probabilities['best_class']})
plt.figure(figsize=(20, 10))
sns.scatterplot(x=df["x"], y=df["y"], hue=df["clusters"])