The goal of this document is to present, at a practical level, the definition of the process using a real-world dataset.
For this exercise we take advantage of the open competition "AI Tunisia Hack 2019" and its challenge on electricity fraud detection. The competition was published on Zindi, a site similar to Kaggle but focused on Africa, which provides a dataset containing client and billing information that closely matches the needs described for the company.
The data was provided by the Tunisian Company of Electricity and Gas (STEG), a public, non-administrative company responsible for the supply of electricity and gas in Tunisia. The company suffered enormous losses, on the order of 200 million Tunisian dinars, due to fraudulent manipulation of meters by consumers.
Client data
Client information provided by STEG; details for each column:
Column Name | Description |
---|---|
Client_id | Unique client ID |
District | District the client belongs to |
Client_catg | Client category |
Region | Region the client belongs to |
Creation_date | Date the client joined |
Target | Output variable: 1 indicates fraud, 0 indicates no fraud |
Invoice data
Billing information for each client; details for each column:
Column Name | Description |
---|---|
Client_id | Unique client ID; relates to the client table |
Invoice_date | Invoice date |
Tarif_type | Tax type |
Counter_number | Meter number |
Counter_statue | Meter status; takes up to 5 values: working fine, not working, on hold, etc. |
Counter_code | Meter code |
Reading_remarque | Notes the STEG agent takes when visiting the client (e.g., if the meter shows an incorrect reading, the agent gives the reason) |
Counter_coefficient | An additional coefficient applied when standard consumption is exceeded |
Consommation_level_1 | Consumption level 1 |
Consommation_level_2 | Consumption level 2 |
Consommation_level_3 | Consumption level 3 |
Consommation_level_4 | Consumption level 4 |
Old_index | Old index |
New_index | New index |
Months_number | Number of months |
Counter_type | Meter type |
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
from matplotlib import style
%matplotlib inline
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from pycaret.classification import *
import seaborn as sns
import shap
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
import time
start = time.time()
We assume that STEG performed a final integration of the suggested sources to produce this dataset.
We load the client and invoice files, which are our data sources.
client_data = pd.read_csv("archive/client_train.csv")
invoice_data = pd.read_csv("archive/invoice_train.csv")
We check how the client dataset is composed.
client_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135493 entries, 0 to 135492
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   disrict        135493 non-null  int64
 1   client_id      135493 non-null  object
 2   client_catg    135493 non-null  int64
 3   region         135493 non-null  int64
 4   creation_date  135493 non-null  object
 5   target         135493 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 6.2+ MB
We check how the invoice dataset is composed.
invoice_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4476749 entries, 0 to 4476748
Data columns (total 16 columns):
 #   Column                Dtype
---  ------                -----
 0   client_id             object
 1   invoice_date          object
 2   tarif_type            int64
 3   counter_number        int64
 4   counter_statue        object
 5   counter_code          int64
 6   reading_remarque      int64
 7   counter_coefficient   int64
 8   consommation_level_1  int64
 9   consommation_level_2  int64
 10  consommation_level_3  int64
 11  consommation_level_4  int64
 12  old_index             int64
 13  new_index             int64
 14  months_number         int64
 15  counter_type          object
dtypes: int64(12), object(4)
memory usage: 546.5+ MB
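Note that counter_statue is typed object: it mixes numeric and string codes, including out-of-range values, which is why feature_preparation below maps it onto a clean 0-5 scale. A quick, illustrative check:
# Inspect the mixed values in counter_statue (motivates the mapping applied later).
invoice_data['counter_statue'].unique()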
profile_client_data = ProfileReport(client_data, minimal=True, title="Fraude - Dataset Cliente")
profile_client_data.to_widgets()
invoice_sample=invoice_data.sample(frac=0.5)
profile_invoice_data = ProfileReport(invoice_sample, minimal=True, title="Fraude - Dataset Facturacion")
profile_invoice_data.to_widgets()
Client Histograms
client_data.hist()
plt.show()
Invoice Histograms
invoice_data.hist()
plt.show()
Observations
Using the generated reports, we identify valuable univariate and multivariate information, such as columns with missing values, zeros, high cardinality, and skewed distributions, as checked below.
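Some of these univariate checks can also be reproduced directly with pandas; a minimal sketch over the frames loaded above:
# Quick univariate checks mirroring parts of the profiling reports.
client_data.isnull().mean()                          # share of missing values per column
(invoice_data.select_dtypes('number') == 0).mean()   # share of zeros per numeric column
client_data['target'].value_counts(normalize=True)   # class balance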
Next Steps
Given the relationship between the two datasets (the client dataset holds the header information, while the invoice dataset holds the detail features that help determine whether a client behaves fraudulently), they must be integrated relationally to create and adapt a labeled dataset for Supervised Learning.
Next, a data-processing stage builds a dataset with high predictive capacity using a boosting approach, which includes the feature engineering and data cleaning steps implemented below (see the sketch after this paragraph).
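Conceptually, the integration is a header/detail join on client_id. A one-line, illustrative sketch (the full version, with per-client aggregations, is implemented in agg_feature below):
# Illustrative only: attach client-level attributes to every invoice row.
merged = pd.merge(invoice_data, client_data, on='client_id', how='left')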
In this section we define the utility functions used to manipulate the datasets, oriented to Feature Engineering and Data Cleaning. They execute the technical processes through their respective functions:
def feature_preparation(cl, inv):
    '''Feature Engineering: creation, transformation, mapping, etc.'''
    # Types - Client
    cl['client_catg'] = cl['client_catg'].astype('category')
    cl['disrict'] = cl['disrict'].astype('category')
    cl['region'] = cl['region'].astype('category')
    # Mappings - Client
    cl['region_group'] = cl['region'].apply(lambda x: 100 if x < 100 else 300 if x > 300 else 200)
    # Conversions and new columns - Client
    cl['creation_date'] = pd.to_datetime(cl['creation_date'])
    cl['coop_time'] = (2019 - cl['creation_date'].dt.year) * 12 - cl['creation_date'].dt.month
    # Mappings - Invoice
    inv['counter_type'] = inv['counter_type'].map({"ELEC": 1, "GAZ": 0})
    inv['counter_statue'] = inv['counter_statue'].map({0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 769: 5, '0': 0, '5': 5, '1': 1, '4': 4, 'A': 0, 618: 5, 269375: 5, 46: 5, 420: 5})
    # Conversions and new columns - Invoice
    inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
    inv['invoice_month'] = inv['invoice_date'].dt.month
    inv['invoice_year'] = inv['invoice_date'].dt.year
    # Note: despite the name, this flags weekend days (dayofweek 5 and 6)
    inv['is_weekday'] = ((pd.DatetimeIndex(inv.invoice_date).dayofweek) // 5 == 1).astype(float)
    inv['delta_index'] = inv['new_index'] - inv['old_index']
    return cl, inv
def agg_feature(invoice, client_df, agg_stat):
    # Days elapsed between consecutive invoices of the same client
    # (computed on a per-client sorted copy so the diffs align row by row)
    invoice = invoice.sort_values(['client_id', 'invoice_date'])
    invoice['delta_time'] = invoice.groupby('client_id')['invoice_date'].diff().dt.days
    # Per-client aggregate statistics over the selected columns
    agg_trans = invoice.groupby('client_id')[agg_stat + ['delta_time']].agg(['mean', 'std', 'min', 'max'])
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    # Number of invoices per client
    df = invoice.groupby('client_id').size().reset_index(name='transactions_count')
    agg_trans = pd.merge(df, agg_trans, on='client_id', how='left')
    # Average of the is_weekday flag per client
    weekday_avg = invoice.groupby('client_id')[['is_weekday']].agg(['mean'])
    weekday_avg.columns = ['_'.join(col).strip() for col in weekday_avg.columns.values]
    weekday_avg.reset_index(inplace=True)
    client_df = pd.merge(client_df, weekday_avg, on='client_id', how='left')
    # Header/detail join: client attributes plus per-client invoice aggregates
    full_df = pd.merge(client_df, agg_trans, on='client_id', how='left')
    full_df['invoice_per_cooperation'] = full_df['transactions_count'] / full_df['coop_time']
    return full_df
def new_features(df):
    # Range and max/mean ratio for each aggregated column
    # (uses the global agg_stat_columns defined further below)
    for col in agg_stat_columns:
        df[col + '_range'] = df[col + '_max'] - df[col + '_min']
        df[col + '_max_mean'] = df[col + '_max'] / df[col + '_mean']
    return df
def drop(df, col_drop):
    # Drop the given columns in place
    for col in col_drop:
        df.drop([col], axis=1, inplace=True)
    return df
def imputeNaN(df, impute_columns):
    # Replace NaN values with the column median
    for col in impute_columns:
        median = df[col].median()
        df[col] = df[col].replace(to_replace=np.nan, value=median)
    return df
def imputeZeros(df, impute_columns):
    # Replace zero values with the column median
    for col in impute_columns:
        median = df[col].median()
        df[col] = df[col].replace(to_replace=0, value=median)
    return df
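A throwaway mini-example of the two imputation helpers (hypothetical values, only to show the behavior):
# Hypothetical demo of imputeNaN / imputeZeros defined above.
demo = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [0, 2, 4]})
demo = imputeNaN(demo, ['a'])    # NaN in 'a' becomes the median of 'a' (2.0)
demo = imputeZeros(demo, ['b'])  # 0 in 'b' becomes the median of 'b' (2.0)
print(demo)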
We apply the strategy described in the document to begin the analysis:
client_data_feature_prepared, invoice_data_feature_prepared = feature_preparation(client_data, invoice_data)
client_data_feature_prepared.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135493 entries, 0 to 135492
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   disrict        135493 non-null  category
 1   client_id      135493 non-null  object
 2   client_catg    135493 non-null  category
 3   region         135493 non-null  category
 4   creation_date  135493 non-null  datetime64[ns]
 5   target         135493 non-null  float64
 6   region_group   135493 non-null  int64
 7   coop_time      135493 non-null  int64
dtypes: category(3), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 5.6+ MB
We observe class imbalance between the fraud and no-fraud labels.
count_no_fraud, count_fraud = client_data_feature_prepared['target'].value_counts()
print(f'Ratio of No Fraud: {count_no_fraud} / Fraud: {count_fraud}')
no_fraud_client_data = client_data_feature_prepared[client_data_feature_prepared['target'] == 0]
fraud_client_data = client_data_feature_prepared[client_data_feature_prepared['target'] == 1]
Ratio of No Fraud: 127927 / Fraud: 7566
client_data_feature_prepared_targets = client_data_feature_prepared['target']
client_data_feature_prepared_targets.hist()
We apply manual Random Undersampling, using the samples generated by the sample method and the count of the minority class (Fraud).
client_data_feature_prepared = pd.concat([fraud_client_data.sample(count_fraud),no_fraud_client_data.sample(count_fraud)])
client_data_feature_prepared.shape
(15132, 8)
We observe that the classes are now balanced in the new dataset.
client_data_feature_prepared_targets = client_data_feature_prepared['target']
client_data_feature_prepared_targets.hist()
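The same balancing can be delegated to imblearn's RandomUnderSampler; a minimal sketch applied to the original, imbalanced frame (assumes imblearn is installed, and that the sampler accepts the mixed-dtype frame; otherwise select numeric columns first):
# Hedged alternative to the manual concat/sample above.
from imblearn.under_sampling import RandomUnderSampler

X_imb = pd.concat([no_fraud_client_data, fraud_client_data])
X_bal, y_bal = RandomUnderSampler(random_state=42).fit_resample(
    X_imb.drop('target', axis=1), X_imb['target'])
print(y_bal.value_counts())  # both classes reduced to the minority count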
Client Dataset
We check whether our client dataset has null values.
client_data_feature_prepared.isnull().sum()
disrict          0
client_id        0
client_catg      0
region           0
creation_date    0
target           0
region_group     0
coop_time        0
dtype: int64
Invoice Dataset
We check whether our invoice dataset has null values.
invoice_data_feature_prepared.isnull().sum()
client_id               0
invoice_date            0
tarif_type              0
counter_number          0
counter_statue          0
counter_code            0
reading_remarque        0
counter_coefficient     0
consommation_level_1    0
consommation_level_2    0
consommation_level_3    0
consommation_level_4    0
old_index               0
new_index               0
months_number           0
counter_type            0
invoice_month           0
invoice_year            0
is_weekday              0
delta_index             0
dtype: int64
We impute the null-containing columns in the invoice dataset, replacing nulls with the median (as implemented in imputeNaN):
impute_columns = [
'counter_type',
'delta_index'
]
invoice_data_feature_prepared = imputeNaN(invoice_data_feature_prepared, impute_columns)
We observe the applied changes.
client_data_feature_prepared.isnull().sum()
disrict          0
client_id        0
client_catg      0
region           0
creation_date    0
target           0
region_group     0
coop_time        0
dtype: int64
invoice_data_feature_prepared.isnull().sum()
client_id               0
invoice_date            0
tarif_type              0
counter_number          0
counter_statue          0
counter_code            0
reading_remarque        0
counter_coefficient     0
consommation_level_1    0
consommation_level_2    0
consommation_level_3    0
consommation_level_4    0
old_index               0
new_index               0
months_number           0
counter_type            0
invoice_month           0
invoice_year            0
is_weekday              0
delta_index             0
dtype: int64
Client Dataset
We check whether our client dataset contains values equal to zero.
(client_data_feature_prepared == 0).astype(int).sum(axis=0)
disrict             0
client_id           0
client_catg         0
region              0
creation_date       0
target           7566
region_group        0
coop_time          19
dtype: int64
We impute the zero-valued entries in the client dataset, replacing them with the median:
impute_columns = [
'coop_time'
]
client_data_feature_prepared = imputeZeros(client_data_feature_prepared, impute_columns)
Invoice Dataset
(invoice_data_feature_prepared == 0).astype(int).sum(axis=0)
client_id                     0
invoice_date                  0
tarif_type                    0
counter_number            43161
counter_statue          4379021
counter_code                 33
reading_remarque              0
counter_coefficient          46
consommation_level_1     467553
consommation_level_2    3816179
consommation_level_3    4293391
consommation_level_4    4383791
old_index                287906
new_index                195931
months_number                 2
counter_type            1397343
invoice_month                 0
invoice_year                  0
is_weekday              3946096
delta_index              472683
dtype: int64
We select the columns whose zeros can be imputed, replacing them with the median:
impute_columns = [
'counter_number',
'counter_coefficient',
'months_number',
'consommation_level_1',
'consommation_level_2',
'consommation_level_3',
'consommation_level_4',
'old_index',
'new_index'
]
invoice_data_feature_prepared = imputeZeros(invoice_data_feature_prepared, impute_columns)
We observe the applied changes.
(client_data_feature_prepared == 0).astype(int).sum(axis=0)
disrict             0
client_id           0
client_catg         0
region              0
creation_date       0
target           7566
region_group        0
coop_time           0
dtype: int64
(invoice_data_feature_prepared == 0).astype(int).sum(axis=0)
client_id                     0
invoice_date                  0
tarif_type                    0
counter_number                0
counter_statue          4379021
counter_code                 33
reading_remarque              0
counter_coefficient           0
consommation_level_1          0
consommation_level_2    3816179
consommation_level_3    4293391
consommation_level_4    4383791
old_index                     0
new_index                     0
months_number                 0
counter_type            1397343
invoice_month                 0
invoice_year                  0
is_weekday              3946096
delta_index              472683
dtype: int64
Note: the remaining zeros belong to categorical (or potentially categorical) columns, where zero is a valid value.
At this point we join the two datasets that we have been working on separately, relating each client to its invoices.
agg_stat_columns = [
'tarif_type',
'counter_number',
'counter_statue',
'counter_code',
'reading_remarque',
'consommation_level_1',
'consommation_level_2',
'consommation_level_3',
'consommation_level_4',
'old_index',
'new_index',
'months_number',
'counter_type',
'invoice_month',
'invoice_year',
'delta_index'
]
client_invoice_aggregated_data = agg_feature(invoice_data_feature_prepared, client_data_feature_prepared, agg_stat_columns)
We now have a single dataset ready to be processed.
client_invoice_aggregated_data.head(n=10)
disrict | client_id | client_catg | region | creation_date | target | region_group | coop_time | is_weekday_mean | transactions_count | tarif_type_mean | tarif_type_std | tarif_type_min | tarif_type_max | counter_number_mean | counter_number_std | counter_number_min | counter_number_max | counter_statue_mean | counter_statue_std | counter_statue_min | counter_statue_max | counter_code_mean | counter_code_std | counter_code_min | counter_code_max | reading_remarque_mean | reading_remarque_std | reading_remarque_min | reading_remarque_max | consommation_level_1_mean | consommation_level_1_std | consommation_level_1_min | consommation_level_1_max | consommation_level_2_mean | consommation_level_2_std | consommation_level_2_min | consommation_level_2_max | consommation_level_3_mean | consommation_level_3_std | consommation_level_3_min | consommation_level_3_max | consommation_level_4_mean | consommation_level_4_std | consommation_level_4_min | consommation_level_4_max | old_index_mean | old_index_std | old_index_min | old_index_max | new_index_mean | new_index_std | new_index_min | new_index_max | months_number_mean | months_number_std | months_number_min | months_number_max | counter_type_mean | counter_type_std | counter_type_min | counter_type_max | invoice_month_mean | invoice_month_std | invoice_month_min | invoice_month_max | invoice_year_mean | invoice_year_std | invoice_year_min | invoice_year_max | delta_index_mean | delta_index_std | delta_index_min | delta_index_max | delta_time_mean | delta_time_std | delta_time_min | delta_time_max | invoice_per_cooperation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | train_Client_63274 | 11 | 313 | 2002-10-05 | 1.0 | 300 | 194 | 0.172414 | 58 | 22.500000 | 14.310161 | 11 | 40 | 1.873182e+12 | 6.942619e+12 | 203184 | 27161135368900 | 0.189655 | 0.712222 | 0 | 5 | 124.482759 | 97.703856 | 5 | 203 | 8.275862 | 1.166688 | 6 | 9 | 480.155172 | 414.513911 | 26 | 1600 | 213.896552 | 564.734954 | 0 | 2770 | 59.465517 | 199.958260 | 0 | 800 | 70.017241 | 376.435521 | 0 | 2601 | 11039.758621 | 11562.132615 | 46 | 35713 | 11592.603448 | 12084.558533 | 46 | 39576 | 3.758621 | 0.923577 | 2 | 8 | 0.603448 | 0.493454 | 0 | 1 | 6.982759 | 2.886699 | 2 | 12 | 2013.034483 | 3.622375 | 2005 | 2019 | 809.362069 | 1102.699442 | 0 | 4601 | 87.947368 | 96.795517 | 0.0 | 309.0 | 0.298969 |
1 | 69 | train_Client_1454 | 11 | 103 | 2011-05-09 | 1.0 | 200 | 91 | 0.086957 | 46 | 25.500000 | 14.660226 | 11 | 40 | 2.268395e+05 | 1.143199e+05 | 113769 | 339910 | 0.000000 | 0.000000 | 0 | 0 | 106.000000 | 102.116056 | 5 | 207 | 8.195652 | 1.046042 | 6 | 9 | 565.369565 | 333.231608 | 56 | 1600 | 125.173913 | 380.100589 | 0 | 2379 | 66.739130 | 206.515583 | 0 | 1051 | 0.000000 | 0.000000 | 0 | 0 | 7628.021739 | 6098.347268 | 176 | 22853 | 8050.956522 | 6692.106979 | 176 | 26304 | 4.173913 | 0.824738 | 4 | 8 | 0.500000 | 0.505525 | 0 | 1 | 4.434783 | 2.344281 | 1 | 8 | 2015.347826 | 2.243186 | 2012 | 2019 | 757.282609 | 744.658719 | 56 | 3579 | 56.977778 | 79.332009 | 0.0 | 364.0 | 0.505495 |
2 | 60 | train_Client_99869 | 11 | 101 | 1980-01-29 | 1.0 | 200 | 467 | 0.000000 | 29 | 11.000000 | 0.000000 | 11 | 11 | 4.682559e+04 | 8.924447e+03 | 42823 | 66038 | 0.206897 | 0.940338 | 0 | 5 | 203.000000 | 0.000000 | 203 | 203 | 7.448276 | 1.453720 | 6 | 9 | 838.344828 | 383.313307 | 138 | 2025 | 88.793103 | 196.914662 | 0 | 800 | 87.275862 | 237.630099 | 0 | 800 | 36.103448 | 145.148285 | 0 | 723 | 17822.172414 | 12669.788830 | 2530 | 42883 | 18598.068966 | 13144.087481 | 499 | 43623 | 4.620690 | 1.859604 | 2 | 8 | 1.000000 | 0.000000 | 1 | 1 | 8.827586 | 3.094569 | 4 | 12 | 2012.206897 | 4.195059 | 2005 | 2019 | 1041.068966 | 698.496443 | 0 | 2723 | 178.428571 | 137.886810 | 0.0 | 607.0 | 0.062099 |
3 | 62 | train_Client_67084 | 11 | 305 | 1985-09-30 | 1.0 | 300 | 399 | 0.317073 | 41 | 10.000000 | 0.000000 | 10 | 10 | 5.372540e+05 | 0.000000e+00 | 537254 | 537254 | 0.000000 | 0.000000 | 0 | 0 | 202.000000 | 0.000000 | 202 | 202 | 7.317073 | 1.404262 | 6 | 9 | 180.000000 | 63.480706 | 8 | 320 | 208.414634 | 276.439774 | 0 | 1192 | 0.000000 | 0.000000 | 0 | 0 | 0.000000 | 0.000000 | 0 | 0 | 42584.804878 | 4410.007360 | 32070 | 47512 | 42966.536585 | 4146.599487 | 33462 | 47721 | 4.097561 | 0.624695 | 4 | 8 | 1.000000 | 0.000000 | 1 | 1 | 6.512195 | 2.899327 | 1 | 12 | 2012.365854 | 4.035815 | 2006 | 2019 | 381.731707 | 303.961101 | 0 | 1392 | 125.425000 | 99.738347 | 4.0 | 338.0 | 0.102757 |
4 | 69 | train_Client_44587 | 11 | 103 | 2015-12-14 | 1.0 | 200 | 36 | 0.222222 | 18 | 25.500000 | 14.920377 | 11 | 40 | 2.636912e+06 | 2.307259e+06 | 394659 | 4879164 | 0.000000 | 0.000000 | 0 | 0 | 106.000000 | 103.928143 | 5 | 207 | 8.888889 | 0.323381 | 8 | 9 | 408.500000 | 376.311706 | 33 | 1371 | 39.833333 | 142.483952 | 0 | 600 | 6.944444 | 29.462783 | 0 | 125 | 0.000000 | 0.000000 | 0 | 0 | 3046.666667 | 2492.987081 | 453 | 7690 | 2647.500000 | 2134.033195 | 453 | 7457 | 4.666667 | 1.371989 | 4 | 8 | 0.500000 | 0.514496 | 0 | 1 | 6.888889 | 3.160210 | 3 | 11 | 2017.444444 | 1.096638 | 2016 | 2019 | 455.277778 | 493.879695 | 33 | 1925 | 77.235294 | 121.026614 | 0.0 | 365.0 | 0.500000 |
5 | 60 | train_Client_76177 | 11 | 101 | 1989-10-31 | 1.0 | 200 | 350 | 0.125000 | 64 | 22.187500 | 14.850445 | 10 | 40 | 2.546354e+12 | 7.979519e+12 | 224083 | 27161100358950 | 0.015625 | 0.125000 | 0 | 1 | 121.968750 | 97.517922 | 5 | 202 | 6.812500 | 1.166667 | 6 | 9 | 223.093750 | 110.356668 | 19 | 800 | 48.625000 | 81.228562 | 0 | 296 | 0.000000 | 0.000000 | 0 | 0 | 0.000000 | 0.000000 | 0 | 0 | 12081.250000 | 7512.851263 | 61 | 22523 | 12176.062500 | 7733.342602 | 28 | 22741 | 4.156250 | 1.086990 | 2 | 10 | 0.593750 | 0.495015 | 0 | 1 | 5.390625 | 3.439348 | 1 | 12 | 2012.281250 | 3.856426 | 2005 | 2019 | 186.093750 | 188.599157 | 0 | 962 | 80.873016 | 99.941352 | 0.0 | 483.0 | 0.182857 |
6 | 62 | train_Client_65840 | 11 | 301 | 1991-11-07 | 1.0 | 300 | 325 | 0.107143 | 56 | 25.500000 | 14.631224 | 11 | 40 | 1.167172e+11 | 4.950179e+11 | 28612 | 2178702362600 | 0.000000 | 0.000000 | 0 | 0 | 209.000000 | 205.846191 | 5 | 413 | 6.589286 | 1.156433 | 6 | 9 | 663.017857 | 458.876275 | 274 | 1938 | 180.678571 | 316.259221 | 0 | 1372 | 88.446429 | 273.315729 | 0 | 1600 | 70.553571 | 280.529689 | 0 | 1672 | 29853.750000 | 27454.247338 | 2672 | 86391 | 30833.125000 | 28047.743531 | 2672 | 90696 | 4.214286 | 1.357997 | 2 | 8 | 0.500000 | 0.504525 | 0 | 1 | 6.571429 | 3.132569 | 3 | 12 | 2011.482143 | 4.352600 | 2005 | 2019 | 865.696429 | 1043.549170 | 0 | 4305 | 89.490909 | 129.470842 | 0.0 | 545.0 | 0.172308 |
7 | 69 | train_Client_91850 | 11 | 103 | 2008-08-20 | 1.0 | 200 | 124 | 0.107692 | 65 | 25.276923 | 14.611113 | 11 | 40 | 1.309658e+05 | 7.330546e+04 | 59337 | 204833 | 0.030769 | 0.174036 | 0 | 1 | 107.553846 | 101.773958 | 5 | 207 | 7.661538 | 1.337873 | 6 | 9 | 281.569231 | 299.003395 | 4 | 1087 | 16.707692 | 72.245831 | 0 | 400 | 6.000000 | 33.982992 | 0 | 205 | 0.000000 | 0.000000 | 0 | 0 | 3763.461538 | 4294.850030 | 12 | 14867 | 3838.138462 | 4504.876094 | 12 | 15954 | 3.907692 | 0.551222 | 2 | 6 | 0.507692 | 0.503831 | 0 | 1 | 4.692308 | 2.410414 | 1 | 8 | 2013.584615 | 3.176537 | 2008 | 2019 | 295.846154 | 356.586255 | 0 | 1405 | 60.078125 | 79.210049 | 0.0 | 364.0 | 0.524194 |
8 | 62 | train_Client_82968 | 11 | 301 | 1977-09-27 | 1.0 | 300 | 495 | 0.154930 | 71 | 23.647887 | 14.920837 | 10 | 40 | 2.135241e+11 | 6.502184e+11 | 169049 | 2165704967900 | 0.000000 | 0.000000 | 0 | 0 | 113.338028 | 98.833907 | 5 | 203 | 7.014085 | 1.236273 | 6 | 9 | 296.408451 | 214.022000 | 75 | 900 | 155.394366 | 221.958457 | 0 | 790 | 54.323944 | 188.735323 | 0 | 800 | 10.338028 | 60.290711 | 0 | 476 | 27687.915493 | 26816.976330 | 170 | 65690 | 27987.760563 | 27061.316366 | 170 | 65895 | 4.028169 | 0.792317 | 2 | 8 | 0.549296 | 0.501105 | 0 | 1 | 6.873239 | 3.549364 | 1 | 12 | 2012.647887 | 3.745849 | 2005 | 2019 | 516.464789 | 503.418849 | 75 | 2476 | 72.557143 | 96.459443 | 0.0 | 395.0 | 0.143434 |
9 | 63 | train_Client_112168 | 11 | 306 | 2013-11-04 | 1.0 | 300 | 61 | 0.117647 | 17 | 11.000000 | 0.000000 | 11 | 11 | 5.946480e+05 | 0.000000e+00 | 594648 | 594648 | 0.058824 | 0.242536 | 0 | 1 | 413.000000 | 0.000000 | 413 | 413 | 9.000000 | 0.000000 | 9 | 9 | 770.529412 | 343.403210 | 238 | 1924 | 239.647059 | 198.298053 | 0 | 400 | 468.058824 | 403.787145 | 0 | 800 | 827.117647 | 1166.074123 | 0 | 3877 | 12586.529412 | 10256.401727 | 1924 | 33314 | 14439.529412 | 12011.845519 | 1924 | 39191 | 4.470588 | 1.328422 | 4 | 8 | 1.000000 | 0.000000 | 1 | 1 | 5.294118 | 3.349715 | 1 | 11 | 2016.294118 | 1.794600 | 2013 | 2019 | 2305.352941 | 1673.588545 | 238 | 5877 | 134.125000 | 75.176127 | 15.0 | 304.0 | 0.278689 |
client_invoice_aggregated_data.shape
(15132, 79)
At this point, we generate new columns from the aggregated statistics: the range (max - min) and the max/mean ratio. See Functions.
client_invoice_aggregated_data_new_features = new_features(client_invoice_aggregated_data)
client_invoice_aggregated_data_new_features.shape
(15132, 111)
We drop the columns we no longer need:
col_drop = ['client_id', 'creation_date','reading_remarque_max','counter_statue_min','counter_type_min','counter_type_max','counter_type_range',
'tarif_type_max', 'delta_index_min', 'consommation_level_4_mean']
client_invoice_data_full = drop(client_invoice_aggregated_data_new_features,col_drop)
client_invoice_data_full.shape
(15132, 101)
Further analysis can start from this dataset.
profileAggregatedData = ProfileReport(client_invoice_data_full, minimal=True, title="Fraude - Dataset Cliente y Facturacion")
profileAggregatedData.to_widgets()
We separate the features from the targets for use in training and validation.
client_invoice_target = client_invoice_data_full['target']
client_invoice_data = client_invoice_data_full.drop('target', axis='columns')
Function to run CatBoost with KFold:
def train_validate_catboost(train_data, train_target, folds=10):
    # Prepare cross-validation
    kfold = KFold(n_splits=folds, shuffle=True, random_state=None)
    model = CatBoostClassifier(
        iterations=5,
        learning_rate=0.1,
        # loss_function='CrossEntropy'
    )
    acc_score = []
    fold_counter = 0
    # Enumerate splits
    for train_index, test_index in kfold.split(train_data):
        X_train, X_test = train_data.iloc[train_index, :], train_data.iloc[test_index, :]
        y_train, y_test = train_target.iloc[train_index], train_target.iloc[test_index]
        model.fit(X_train, y_train,
                  cat_features=['disrict', 'client_catg', 'region'],
                  eval_set=(X_test, y_test),
                  verbose=False)
        pred_values = model.predict(X_test)
        acc = accuracy_score(y_test, pred_values)
        acc_score.append(acc)
        print(f'Fold: {fold_counter} with {acc}')
        fold_counter = fold_counter + 1
    avg_acc_score = sum(acc_score) / folds
    print('CatBoost model is fitted: ' + str(model.is_fitted()))
    print(f'CatBoost Avg {avg_acc_score}')
    print(model.get_params())
Execution and results
train_validate_catboost(client_invoice_data.copy(),client_invoice_target.copy())
Fold: 0 with 0.7344782034346103
Fold: 1 with 0.7318361955085865
Fold: 2 with 0.7382683410442828
Fold: 3 with 0.7468605419695968
Fold: 4 with 0.7349636483807006
Fold: 5 with 0.7349636483807006
Fold: 6 with 0.7164573694646398
Fold: 7 with 0.7276933245208196
Fold: 8 with 0.7349636483807006
Fold: 9 with 0.7157964309319234
CatBoost model is fitted: True
CatBoost Avg 0.731628135201656
{'iterations': 5, 'learning_rate': 0.1}
client_invoice_data_full.shape
(15132, 101)
clf1 = setup(client_invoice_data_full,
             # TARGET
             target = 'target',
             # DATA SAMPLING
             train_size = 0.70,
             fix_imbalance = True,
             # VARIABLE TYPES
             categorical_features = None,
             ordinal_features = None,
             numeric_features = None,
             # NULL IMPUTATION
             categorical_imputation = 'mode',
             numeric_imputation = 'mean',
             # DIMENSIONALITY REDUCTION
             remove_outliers = False,
             ignore_low_variance = False,
             remove_multicollinearity = False,
             feature_selection = False,
             pca = False,
             # FEATURE ENGINEERING
             normalize = False,
             transformation = False,
             polynomial_features = False,
             trigonometry_features = False,
             feature_interaction = False,
             feature_ratio = False,
             # MLFLOW
             log_experiment = True,
             experiment_name = 'Electrical Fraud Tunisia',
             log_plots = True,
             log_profile = True,
             log_data = True,
             silent = True
             )
Description | Value | |
---|---|---|
0 | session_id | 489 |
1 | Target | target |
2 | Target Type | Binary |
3 | Label Encoded | 0.0: 0, 1.0: 1 |
4 | Original Data | (15132, 101) |
5 | Missing Values | True |
6 | Numeric Features | 86 |
7 | Categorical Features | 14 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (10592, 203) |
12 | Transformed Test Set | (4540, 203) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | True |
20 | Experiment Name | Electrical Fraud Tunisia |
21 | USI | 4c7d |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | mode |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Clustering | False |
45 | Clustering Iteration | None |
46 | Polynomial Features | False |
47 | Polynomial Degree | None |
48 | Trignometry Features | False |
49 | Polynomial Threshold | None |
50 | Group Features | False |
51 | Feature Selection | False |
52 | Features Selection Threshold | None |
53 | Feature Interaction | False |
54 | Feature Ratio | False |
55 | Interaction Threshold | None |
56 | Fix Imbalance | True |
57 | Fix Imbalance Method | SMOTE |
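With fix_imbalance = True, pycaret reports SMOTE as the balancing method (applied to the training folds only). For reference, a minimal standalone sketch of SMOTE on toy data, assuming imblearn is available:
# Illustrative SMOTE run on synthetic data; pycaret does the equivalent internally.
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
X_sm, y_sm = SMOTE(random_state=0).fit_resample(X_toy, y_toy)
print(np.bincount(y_sm))  # minority class oversampled to parity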
Results of the model comparison.
best_model = compare_models()
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
catboost | CatBoost Classifier | 0.7836 | 0.8674 | 0.7866 | 0.7828 | 0.7846 | 0.5672 | 0.5673 | 7.2830 |
lightgbm | Light Gradient Boosting Machine | 0.7785 | 0.8623 | 0.7841 | 0.7764 | 0.7801 | 0.5570 | 0.5572 | 0.5830 |
xgboost | Extreme Gradient Boosting | 0.7685 | 0.8537 | 0.7758 | 0.7655 | 0.7705 | 0.5370 | 0.5372 | 2.9460 |
gbc | Gradient Boosting Classifier | 0.7670 | 0.8540 | 0.7752 | 0.7637 | 0.7692 | 0.5340 | 0.5343 | 3.4410 |
rf | Random Forest Classifier | 0.7664 | 0.8509 | 0.7696 | 0.7657 | 0.7675 | 0.5329 | 0.5331 | 0.9790 |
et | Extra Trees Classifier | 0.7542 | 0.8389 | 0.7590 | 0.7529 | 0.7558 | 0.5085 | 0.5086 | 0.8530 |
ada | Ada Boost Classifier | 0.7487 | 0.8299 | 0.7634 | 0.7424 | 0.7526 | 0.4973 | 0.4977 | 0.9040 |
lda | Linear Discriminant Analysis | 0.7200 | 0.7970 | 0.7319 | 0.7159 | 0.7237 | 0.4399 | 0.4403 | 0.3470 |
knn | K Neighbors Classifier | 0.6803 | 0.7403 | 0.6577 | 0.6905 | 0.6734 | 0.3607 | 0.3615 | 0.3930 |
dt | Decision Tree Classifier | 0.6774 | 0.6774 | 0.6758 | 0.6790 | 0.6772 | 0.3548 | 0.3550 | 0.4040 |
svm | SVM - Linear Kernel | 0.6572 | 0.0000 | 0.6743 | 0.6754 | 0.6524 | 0.3144 | 0.3315 | 0.3190 |
qda | Quadratic Discriminant Analysis | 0.6173 | 0.6903 | 0.8497 | 0.5830 | 0.6870 | 0.2338 | 0.2767 | 0.2580 |
ridge | Ridge Classifier | 0.5774 | 0.0000 | 0.3828 | 0.7499 | 0.4247 | 0.1554 | 0.2311 | 0.2910 |
nb | Naive Bayes | 0.5763 | 0.6711 | 0.1722 | 0.9044 | 0.2880 | 0.1541 | 0.2616 | 0.2290 |
lr | Logistic Regression | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.2620 |
Now we search for the best hyperparameters, based on the best algorithm:
tuned_best_model = tune_model(best_model)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7840 | 0.8698 | 0.8023 | 0.7745 | 0.7882 | 0.5679 | 0.5683 |
1 | 0.7717 | 0.8637 | 0.7966 | 0.7594 | 0.7776 | 0.5434 | 0.5440 |
2 | 0.7762 | 0.8588 | 0.7759 | 0.7774 | 0.7766 | 0.5524 | 0.5524 |
3 | 0.7885 | 0.8693 | 0.7702 | 0.8004 | 0.7850 | 0.5770 | 0.5774 |
4 | 0.7866 | 0.8665 | 0.7891 | 0.7861 | 0.7876 | 0.5732 | 0.5732 |
5 | 0.7686 | 0.8385 | 0.7853 | 0.7609 | 0.7729 | 0.5373 | 0.5375 |
6 | 0.7790 | 0.8663 | 0.7891 | 0.7745 | 0.7817 | 0.5580 | 0.5581 |
7 | 0.7639 | 0.8507 | 0.7646 | 0.7646 | 0.7646 | 0.5279 | 0.5279 |
8 | 0.7724 | 0.8649 | 0.7792 | 0.7691 | 0.7741 | 0.5448 | 0.5449 |
9 | 0.7847 | 0.8779 | 0.7830 | 0.7860 | 0.7845 | 0.5694 | 0.5694 |
Mean | 0.7776 | 0.8626 | 0.7835 | 0.7753 | 0.7793 | 0.5551 | 0.5553 |
SD | 0.0079 | 0.0105 | 0.0109 | 0.0122 | 0.0071 | 0.0158 | 0.0158 |
In this part we can obtain the overall results of the model by selecting each button in the interactive widget.
evaluate_model(tuned_best_model)
We show the relationship between the true positive rate and the false positive rate using the ROC curve.
plot_model(tuned_best_model)
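plot_model defaults to the AUC/ROC plot. A script-friendly equivalent with sklearn, assuming pycaret 2.x exposes the hold-out split through get_config (the key names are an assumption):
# Hedged sketch: manual ROC curve on pycaret's hold-out split.
from sklearn.metrics import roc_curve, auc

X_hold = get_config('X_test')   # assumed pycaret config keys
y_hold = get_config('y_test')
scores = tuned_best_model.predict_proba(X_hold)[:, 1]
fpr, tpr, _ = roc_curve(y_hold, scores)
plt.plot(fpr, tpr, label=f'AUC = {auc(fpr, tpr):.3f}')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()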
catboost_model = create_model('catboost')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7925 | 0.8768 | 0.7966 | 0.7907 | 0.7936 | 0.5849 | 0.5849 |
1 | 0.7858 | 0.8714 | 0.8060 | 0.7754 | 0.7904 | 0.5717 | 0.5721 |
2 | 0.7819 | 0.8667 | 0.7702 | 0.7896 | 0.7798 | 0.5638 | 0.5639 |
3 | 0.7951 | 0.8709 | 0.7815 | 0.8043 | 0.7927 | 0.5902 | 0.5904 |
4 | 0.7809 | 0.8707 | 0.7834 | 0.7805 | 0.7820 | 0.5618 | 0.5618 |
5 | 0.7677 | 0.8431 | 0.7665 | 0.7694 | 0.7679 | 0.5354 | 0.5354 |
6 | 0.7819 | 0.8647 | 0.7891 | 0.7788 | 0.7839 | 0.5637 | 0.5638 |
7 | 0.7734 | 0.8571 | 0.7834 | 0.7689 | 0.7761 | 0.5467 | 0.5468 |
8 | 0.7847 | 0.8699 | 0.7943 | 0.7796 | 0.7869 | 0.5694 | 0.5695 |
9 | 0.7923 | 0.8825 | 0.7943 | 0.7914 | 0.7928 | 0.5845 | 0.5845 |
Mean | 0.7836 | 0.8674 | 0.7866 | 0.7828 | 0.7846 | 0.5672 | 0.5673 |
SD | 0.0081 | 0.0103 | 0.0115 | 0.0105 | 0.0080 | 0.0163 | 0.0163 |
interpret_model(catboost_model, plot='summary', save=True)
interpret_model(catboost_model, plot='correlation', save=True)
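interpret_model wraps the shap library imported at the top; the same summary plot can be produced directly. A sketch, assuming the transformed training set is available via get_config:
# Hedged sketch of what interpret_model(plot='summary') does under the hood.
X_train_t = get_config('X_train')          # assumed pycaret config key
explainer = shap.TreeExplainer(catboost_model)
shap_values = explainer.shap_values(X_train_t)
shap.summary_plot(shap_values, X_train_t)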
plot_model(catboost_model)
plot_model(catboost_model, plot = 'confusion_matrix')
plot_model(catboost_model, plot = 'auc')
plot_model(catboost_model, plot = 'feature')
rf_model = create_model('rf')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7670 | 0.8586 | 0.7891 | 0.7563 | 0.7724 | 0.5339 | 0.5344 |
1 | 0.7698 | 0.8509 | 0.7947 | 0.7576 | 0.7757 | 0.5396 | 0.5402 |
2 | 0.7686 | 0.8497 | 0.7627 | 0.7729 | 0.7678 | 0.5373 | 0.5374 |
3 | 0.7630 | 0.8523 | 0.7363 | 0.7789 | 0.7570 | 0.5260 | 0.5268 |
4 | 0.7724 | 0.8549 | 0.7759 | 0.7715 | 0.7737 | 0.5448 | 0.5449 |
5 | 0.7365 | 0.8297 | 0.7288 | 0.7414 | 0.7350 | 0.4731 | 0.4732 |
6 | 0.7771 | 0.8505 | 0.7740 | 0.7799 | 0.7769 | 0.5543 | 0.5543 |
7 | 0.7611 | 0.8431 | 0.7665 | 0.7593 | 0.7629 | 0.5222 | 0.5222 |
8 | 0.7677 | 0.8531 | 0.7792 | 0.7620 | 0.7705 | 0.5354 | 0.5355 |
9 | 0.7809 | 0.8659 | 0.7887 | 0.7770 | 0.7828 | 0.5618 | 0.5619 |
Mean | 0.7664 | 0.8509 | 0.7696 | 0.7657 | 0.7675 | 0.5329 | 0.5331 |
SD | 0.0115 | 0.0091 | 0.0209 | 0.0118 | 0.0128 | 0.0229 | 0.0229 |
interpret_model(rf_model,plot='summary', save=True)
interpret_model(rf_model,plot='correlation', save=True)
evaluate_model(rf_model)
plot_model(rf_model, plot = 'auc')
plot_model(rf_model, plot = 'feature')
plot_model(rf_model, plot = 'confusion_matrix')
end = time.time()
print(end - start, "secs")
4585.058943271637 secs