Ensemble Methods Comparison: Bagging, Boosting, and Stacking Techniques

Bagging Classifier Implementation

Base Model Performance

# Train a single decision tree as the baseline to beat
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score

base_model = DecisionTreeClassifier(random_state=42)
base_model.fit(X_train, y_train)

y_pred_base = base_model.predict(X_test)
base_recall = recall_score(y_test, y_pred_base)
print("Base model recall: {:.4f}".format(base_recall))

Hyperparameter Tuning (Grid Search)

# Search over ensemble size and the row/column subsampling rates
param_grid = {
    "n_estimators": [10, 50, 100],    # number of trees in the ensemble
    "max_samples": [0.5, 0.8, 1.0],   # fraction of rows drawn for each tree
    "max_features": [0.5, 0.8, 1.0],  # fraction of features drawn for each tree
    "bootstrap": [True]               # sample rows with replacement
}

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# Bag the decision tree from above; the `estimator` argument requires scikit-learn >= 1.2
bagging = BaggingClassifier(estimator=base_model, random_state=42)

grid_search = GridSearchCV(estimator=bagging,
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring="recall")
grid_search.fit(X_train, y_train)

Results and Comparison

print("Best parameters:", grid_search.best_params_)
print("Best CV recall: {:.4f}".format(grid_search.best_score_))

best_bagging = grid_search.best_estimator_
y_pred_bagging = best_bagging.predict(X_test)
bagging_recall = recall_score(y_test, y_pred_bagging)
print("\nRecall en test del Bagging: {:.4f}".format(bagging_recall))

if bagging_recall > base_recall:
    print("Bagging improves on the base model.")
else:
    print("Bagging does not improve on the base model.")

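Because bagging samples rows with replacement, each tree sees only part of the training data, and the held-out rows give a built-in validation signal. A minimal sketch of that out-of-bag estimate using scikit-learn's oob_score option (not part of the grid search above):

# Out-of-bag estimate: each tree is scored on the rows it never saw during
# training, giving a validation-style accuracy without a separate hold-out set
oob_bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
oob_bagging.fit(X_train, y_train)
print("OOB accuracy: {:.4f}".format(oob_bagging.oob_score_))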
XGBoost Classifier Implementation

Model Definition and Training

from xgboost import XGBClassifier

# Regularized gradient boosting on the preprocessed features (X_train_tr / X_test_tr)
xgb = XGBClassifier(
    n_estimators=100, learning_rate=0.1,
    max_depth=3, subsample=0.8,
    colsample_bytree=0.8, gamma=0, reg_alpha=0,
    reg_lambda=1, eval_metric='logloss', random_state=42)

xgb.fit(X_train_tr, y_train)
y_pred = xgb.predict(X_test_tr)

Evaluation Results

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
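The model above fixes n_estimators=100; XGBoost can instead pick the number of boosting rounds automatically via early stopping. A minimal sketch, assuming a validation split carved out of the training data (the split itself is hypothetical, not part of the original pipeline) and xgboost >= 1.6, where early_stopping_rounds is a constructor argument:

from sklearn.model_selection import train_test_split

# Hold out part of the training data to monitor logloss during boosting
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_tr, y_train, test_size=0.2, random_state=42, stratify=y_train)

xgb_es = XGBClassifier(
    n_estimators=1000,             # upper bound; early stopping picks the actual count
    learning_rate=0.1, max_depth=3,
    early_stopping_rounds=10,      # stop after 10 rounds without logloss improvement
    eval_metric='logloss', random_state=42)
xgb_es.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print("Best iteration:", xgb_es.best_iteration)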

AdaBoost Classifier Implementation

Base Estimator and Model Setup

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
base_model = LogisticRegression(solver='liblinear', penalty='l2')
ada = AdaBoostClassifier(estimator=base_model, random_state=42)

Hyperparameter Tuning

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'estimator__C': [0.01, 0.1, 1, 10]  # regularization strength of the base logistic regression
}

grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)
grid_search.fit(X_train_tr, y_train)

Optimized Model Results

print("Mejores parámetros:", grid_search.best_params_)
print("Mejor Recall en CV:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tr)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
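Since AdaBoost builds its ensemble sequentially, intermediate ensembles can be scored without refitting. A minimal sketch using staged_predict on the tuned model to see how test recall evolves as estimators are added:

# Score the ensemble after each boosting stage to spot where recall plateaus
staged_recalls = [
    recall_score(y_test, y_stage)
    for y_stage in best_model.staged_predict(X_test_tr)
]
best_stage, best_rec = max(enumerate(staged_recalls, start=1), key=lambda t: t[1])
print("Best recall {:.4f} with {} estimators".format(best_rec, best_stage))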

Stacking Classifier Implementation

Defining Base and Meta Models

from sklearn.ensemble import StackingClassifier

base_models_1 = [
    ('bagging', BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)),
    # use_label_encoder was removed in XGBoost 2.0, so it is no longer passed here
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
    ('ada', AdaBoostClassifier(estimator=LogisticRegression(solver='liblinear'), random_state=42))
]

# A logistic regression meta-model learns from the base models' out-of-fold predictions
meta_model = LogisticRegression(solver='liblinear')
modelo_apilado = StackingClassifier(estimators=base_models_1, final_estimator=meta_model, cv=5)

Tuning the Stacking Model

param_grid = {
    'final_estimator__C': [0.1, 1],  # regularization strength of the meta-model
}

grid_search = GridSearchCV(estimator=modelo_apilado, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)
grid_search.fit(X_train_tr, y_train)

Evaluation Results

# Predict on the same preprocessed features the model was trained on
y_pred = grid_search.predict(X_test_tr)

sensibilidad = recall_score(y_test, y_pred)
print(f"Recall of the optimized stacking classifier: {sensibilidad:.2f}")
print(f"Best parameters found: {grid_search.best_params_}")
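Because the meta-model is a logistic regression, its fitted coefficients show how much weight the stack gives each base model's probability output (for a binary problem, scikit-learn keeps one probability column per base estimator). A minimal sketch of that inspection:

# Inspect the weight the meta-model assigns to each base model
best_stack = grid_search.best_estimator_
for (name, _), coef in zip(base_models_1, best_stack.final_estimator_.coef_[0]):
    print("{}: {:.3f}".format(name, coef))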

Gradient Boosting Implementation

Initial Model Training

from sklearn.ensemble import GradientBoostingClassifier

# A deliberately small ensemble (5 shallow trees) as a first look before tuning
gb_model = GradientBoostingClassifier(min_samples_split=2, max_depth=2, n_estimators=5,
                                      learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)
print("Accuracy (Gradient Boosting):", accuracy_score(y_test, y_pred_gb))
print("Feature importances:", gb_model.feature_importances_)
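feature_importances_ is just an array in feature order; pairing it with column names makes it readable. A minimal sketch, assuming X_train is a pandas DataFrame (with a NumPy array, substitute an explicit list of feature names):

import pandas as pd

# Rank features by their contribution to the boosted trees' splits
importances = pd.Series(gb_model.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False))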

Hyperparameter Optimization (Grid Search)

param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1, 0.3],
    'max_depth': [1, 3, 5],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Optimized Model Evaluation

print("Mejores hiperparámetros:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
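To close the comparison, the test recalls computed above can be printed side by side. A minimal sketch, assuming the variables from the earlier sections are still in scope; the tuned AdaBoost is omitted here because the best_model name was later reused for gradient boosting, so it would need to be kept under a distinct name to join the table:

# Side-by-side test recall; recomputed where only predictions were printed earlier
summary = {
    "Decision tree (base)": base_recall,
    "Bagging (tuned)": bagging_recall,
    "XGBoost": recall_score(y_test, xgb.predict(X_test_tr)),
    "Stacking (tuned)": sensibilidad,
    "Gradient boosting (tuned)": recall_score(y_test, best_model.predict(X_test)),
}
for name, rec in sorted(summary.items(), key=lambda kv: kv[1], reverse=True):
    print("{:<28s} {:.4f}".format(name, rec))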