diff --git a/src/figures/Confusion matrix (tuned) – scale + mlp.png b/src/figures/Confusion matrix (tuned) – scale + mlp.png
new file mode 100644
index 0000000..5e1e5c8
Binary files /dev/null and b/src/figures/Confusion matrix (tuned) – scale + mlp.png differ
diff --git a/src/figures/Confusion matrix (tuned) – scale + rf.png b/src/figures/Confusion matrix (tuned) – scale + rf.png
new file mode 100644
index 0000000..f8322bd
Binary files /dev/null and b/src/figures/Confusion matrix (tuned) – scale + rf.png differ
diff --git a/src/figures/Confusion matrix (tuned) – scale + svm.png b/src/figures/Confusion matrix (tuned) – scale + svm.png
new file mode 100644
index 0000000..2c98411
Binary files /dev/null and b/src/figures/Confusion matrix (tuned) – scale + svm.png differ
diff --git a/src/figures/Confusion matrix (tuned) – scale_pca_85 + knn.png b/src/figures/Confusion matrix (tuned) – scale_pca_85 + knn.png
new file mode 100644
index 0000000..76f4d2a
Binary files /dev/null and b/src/figures/Confusion matrix (tuned) – scale_pca_85 + knn.png differ
diff --git a/src/figures/tuning_results.csv b/src/figures/tuning_results.csv
new file mode 100644
index 0000000..243f054
--- /dev/null
+++ b/src/figures/tuning_results.csv
@@ -0,0 +1,2 @@
+config,preprocess,model,params,mean_acc,std_acc
+scale + svm,scale,svm,"{'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}",0.8581719138625145,0.013348223889216927
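Note: the single row in tuning_results.csv fully specifies the winning configuration. As a minimal sketch (not the project's own helper code; X_train and y_train are placeholders), it maps onto a scikit-learn pipeline like this:

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    # "scale + svm" with the tuned parameters logged in tuning_results.csv.
    best_svm = Pipeline([
        ("scale", StandardScaler()),
        ("svm", SVC(kernel="rbf", C=4, gamma="scale", class_weight=None)),
    ])
    # best_svm.fit(X_train, y_train)  # should land near mean_acc ~0.858 under 5-fold CV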
diff --git a/src/labelsX.npy b/src/labelsX.npy
new file mode 100644
index 0000000..4861c44
Binary files /dev/null and b/src/labelsX.npy differ
diff --git a/src/labelsX_scale_mlp.npy b/src/labelsX_scale_mlp.npy
new file mode 100644
index 0000000..1c825ff
Binary files /dev/null and b/src/labelsX_scale_mlp.npy differ
diff --git a/src/labelsX_scale_pca_85_knn.npy b/src/labelsX_scale_pca_85_knn.npy
new file mode 100644
index 0000000..4490386
Binary files /dev/null and b/src/labelsX_scale_pca_85_knn.npy differ
diff --git a/src/labelsX_scale_rf.npy b/src/labelsX_scale_rf.npy
new file mode 100644
index 0000000..28c62f6
Binary files /dev/null and b/src/labelsX_scale_rf.npy differ
diff --git a/src/labelsX_scale_svm.npy b/src/labelsX_scale_svm.npy
new file mode 100644
index 0000000..4861c44
Binary files /dev/null and b/src/labelsX_scale_svm.npy differ
diff --git a/src/output.txt b/src/output.txt
new file mode 100644
index 0000000..a11dca4
--- /dev/null
+++ b/src/output.txt
@@ -0,0 +1,185 @@
+/home/hoo2/Work/AUTh/PatternRecognition/Assignment_2025-26/.venv/bin/python /home/hoo2/Work/AUTh/PatternRecognition/Assignment_2025-26/src/partD.py all
+[       scale] [       gnb] val_acc=0.7095
+[       scale] [        rf] val_acc=0.8205
+[       scale] [    logreg] val_acc=0.7730
+[       scale] [linear_svm] val_acc=0.7707
+[       scale] [       svm] val_acc=0.8593
+[       scale] [       mlp] val_acc=0.8382
+[       scale] [       knn] val_acc=0.8342
+[       scale] [  adaboost] val_acc=0.6832
+[scale_pca_66] [       gnb] val_acc=0.7524
+[scale_pca_66] [        rf] val_acc=0.8096
+[scale_pca_66] [    logreg] val_acc=0.7862
+[scale_pca_66] [linear_svm] val_acc=0.7736
+[scale_pca_66] [       svm] val_acc=0.8582
+[scale_pca_66] [       mlp] val_acc=0.8359
+[scale_pca_66] [       knn] val_acc=0.8370
+[scale_pca_66] [  adaboost] val_acc=0.6878
+[scale_pca_75] [       gnb] val_acc=0.7547
+[scale_pca_75] [        rf] val_acc=0.8130
+[scale_pca_75] [    logreg] val_acc=0.7839
+[scale_pca_75] [linear_svm] val_acc=0.7696
+[scale_pca_75] [       svm] val_acc=0.8565
+[scale_pca_75] [       mlp] val_acc=0.8216
+[scale_pca_75] [       knn] val_acc=0.8370
+[scale_pca_75] [  adaboost] val_acc=0.6878
+[scale_pca_85] [       gnb] val_acc=0.7501
+[scale_pca_85] [        rf] val_acc=0.8033
+[scale_pca_85] [    logreg] val_acc=0.7810
+[scale_pca_85] [linear_svm] val_acc=0.7662
+[scale_pca_85] [       svm] val_acc=0.8588
+[scale_pca_85] [       mlp] val_acc=0.8188
+[scale_pca_85] [       knn] val_acc=0.8388
+[scale_pca_85] [  adaboost] val_acc=0.6998
+
+=== Investigation summary ===
+model
+svm           0.859348
+knn           0.838765
+mlp           0.838193
+rf            0.820469
+logreg        0.786164
+linear_svm    0.773585
+gnb           0.754717
+adaboost      0.699828
+
+Selected top-3 models for further analysis: ['svm', 'knn', 'mlp']
+
+Best configuration overall: preprocess=scale, model=svm, val_acc=0.8593
+
+Classification report (best config):
+              precision    recall  f1-score   support
+
+           1       0.94      0.96      0.95       354
+           2       0.76      0.73      0.75       344
+           3       0.92      0.93      0.93       351
+           4       0.91      0.91      0.91       343
+           5       0.75      0.77      0.76       357
+
+    accuracy                           0.86      1749
+   macro avg       0.86      0.86      0.86      1749
+weighted avg       0.86      0.86      0.86      1749
+
+
+[TUNING] scale + rf (cv=5) ...
+[scale | rf] combo 1/1 mean=0.8228 params={'n_estimators': 400, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}
+  best mean_acc=0.8228 (std=0.0121) params={'n_estimators': 400, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}
+
+[TUNING] scale + mlp (cv=5) ...
+[scale | mlp] combo 1/1 mean=0.8407 params={'hidden_layer_sizes': (128,), 'alpha': 0.001, 'learning_rate_init': 0.01, 'activation': 'relu', 'solver': 'adam'}
+  best mean_acc=0.8407 (std=0.0098) params={'hidden_layer_sizes': (128,), 'alpha': 0.001, 'learning_rate_init': 0.01, 'activation': 'relu', 'solver': 'adam'}
+
+[TUNING] scale_pca_85 + knn (cv=5) ...
+[scale_pca_85 | knn] combo 1/1 mean=0.8313 params={'n_neighbors': 9, 'weights': 'distance', 'p': 2}
+  best mean_acc=0.8313 (std=0.0117) params={'n_neighbors': 9, 'weights': 'distance', 'p': 2}
+
+[TUNING] scale + svm (cv=5) ...
+[scale | svm] combo 1/1 mean=0.8582 params={'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}
+  best mean_acc=0.8582 (std=0.0133) params={'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}
+
+=== Tuning summary (best overall) ===
+{'name': 'scale + svm', 'preprocess_spec': {'type': 'pipeline', 'steps': [{'type': 'scaler', 'params': {}}]}, 'preprocess_name': 'scale', 'model': 'svm', 'params': {'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}, 'mean_acc': 0.8581719138625145, 'std_acc': 0.013348223889216927}
+
+============================================================
+[FINAL - VALIDATION] scale + rf
+Confusion matrix:
+[[338   6   5   3   2]
+ [  4 239  11  12  78]
+ [ 11   2 316  21   1]
+ [  3  12  17 299  12]
+ [ 13  71   4   9 260]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.92      0.95      0.93       354
+           2       0.72      0.69      0.71       344
+           3       0.90      0.90      0.90       351
+           4       0.87      0.87      0.87       343
+           5       0.74      0.73      0.73       357
+
+    accuracy                           0.83      1749
+   macro avg       0.83      0.83      0.83      1749
+weighted avg       0.83      0.83      0.83      1749
+
+============================================================
+[FINAL] scale_rf: saved labelsX_scale_rf.npy shape=(6955,)
+
+============================================================
+[FINAL - VALIDATION] scale + mlp
+Confusion matrix:
+[[338   1   9   2   4]
+ [  5 244  13   7  75]
+ [ 10   3 320  16   2]
+ [  0  14  16 302  11]
+ [  8  74   1  16 258]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.94      0.95      0.95       354
+           2       0.73      0.71      0.72       344
+           3       0.89      0.91      0.90       351
+           4       0.88      0.88      0.88       343
+           5       0.74      0.72      0.73       357
+
+    accuracy                           0.84      1749
+   macro avg       0.83      0.84      0.83      1749
+weighted avg       0.83      0.84      0.84      1749
+
+============================================================
+[FINAL] scale_mlp: saved labelsX_scale_mlp.npy shape=(6955,)
+
+============================================================
+[FINAL - VALIDATION] scale_pca_85 + knn
+Confusion matrix:
+[[346   2   5   0   1]
+ [  5 193   9   7 130]
+ [ 19   1 319  11   1]
+ [  4   9  17 301  12]
+ [  8  33   1   6 309]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.91      0.98      0.94       354
+           2       0.81      0.56      0.66       344
+           3       0.91      0.91      0.91       351
+           4       0.93      0.88      0.90       343
+           5       0.68      0.87      0.76       357
+
+    accuracy                           0.84      1749
+   macro avg       0.85      0.84      0.84      1749
+weighted avg       0.85      0.84      0.84      1749
+
+============================================================
+[FINAL] scale_pca_85_knn: saved labelsX_scale_pca_85_knn.npy shape=(6955,)
+
+============================================================
+[FINAL - VALIDATION] scale + svm
+Confusion matrix:
+[[340   2   8   1   3]
+ [  3 251   9   6  75]
+ [  7   1 327  14   2]
+ [  0  12   9 311  11]
+ [ 11  63   1   8 274]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.94      0.96      0.95       354
+           2       0.76      0.73      0.75       344
+           3       0.92      0.93      0.93       351
+           4       0.91      0.91      0.91       343
+           5       0.75      0.77      0.76       357
+
+    accuracy                           0.86      1749
+   macro avg       0.86      0.86      0.86      1749
+weighted avg       0.86      0.86      0.86      1749
+
+============================================================
+[FINAL] scale_svm: saved labelsX_scale_svm.npy shape=(6955,)
+Saved labels to labelsX.npy with shape (6955,)
+
+Process finished with exit code 0
+
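Review note: the per-class numbers in the reports above follow directly from the confusion matrices, and the dominant error mode is visible in them: class 2 is mostly confused with class 5 (which is why problem_demonstration_phase in partD.py below is run with class_a=2, class_b=5). A small sketch of the arithmetic for the scale_pca_85 + knn block, with the numbers copied from the log:

    import numpy as np

    # Validation confusion matrix of scale_pca_85 + knn; rows = true class 1..5.
    cm = np.array([
        [346,   2,   5,   0,   1],
        [  5, 193,   9,   7, 130],  # class 2: 130 of 344 samples land in class 5
        [ 19,   1, 319,  11,   1],
        [  4,   9,  17, 301,  12],
        [  8,  33,   1,   6, 309],
    ])
    recall_2 = cm[1, 1] / cm[1, :].sum()     # 193 / 344 ~ 0.56, as in the report
    precision_2 = cm[1, 1] / cm[:, 1].sum()  # 193 / 238 ~ 0.81, as in the report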
diff --git a/src/partA.py b/src/partA.py
index 140851f..8e7f6b0 100644
--- a/src/partA.py
+++ b/src/partA.py
@@ -197,6 +197,17 @@ def plot_gaussians_3d(
     ax.set_zlabel("pdf")
     plt.show()
 
+    # plt.figure(figsize=(6, 5))
+    # plt.scatter(X[:, 0], X[:, 1], s=10, alpha=0.35)
+    # plt.contour(Xgrid, Ygrid, Z, levels=8, linewidths=1.5)
+    #
+    # plt.title("Estimated Gaussian density (ML)")
+    # plt.xlabel("x₁")
+    # plt.ylabel("x₂")
+    #
+    # plt.tight_layout()
+    # plt.show()
+
 # --------------------------------------------------
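Review note on the partB.py hunk below: the original title string lacked the f prefix, so the placeholders were printed literally instead of being interpolated. With illustrative values:

    mu_true, var_true = 0, 1
    print("N({mu_true}, {var_true})")   # -> N({mu_true}, {var_true})  (the old bug)
    print(f"N({mu_true}, {var_true})")  # -> N(0, 1)                   (the fix)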
diff --git a/src/partB.py b/src/partB.py
index c95bad9..c4c426a 100644
--- a/src/partB.py
+++ b/src/partB.py
@@ -302,7 +302,7 @@ def plot_histogram_with_pdf(
     plt.plot(x_plot, pdf_true, label=f"True N({mu_true}, {var_true}) pdf")
     plt.xlabel("x")
     plt.ylabel("Density")
-    plt.title("Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
+    plt.title(f"Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
     plt.legend()
     plt.grid(True)
     plt.show()
diff --git a/src/partD.py b/src/partD.py
index 22ad12c..067e54d 100644
--- a/src/partD.py
+++ b/src/partD.py
@@ -33,6 +33,8 @@ import matplotlib as mpl
 import matplotlib.pyplot as plt
 from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
@@ -389,8 +391,9 @@ def plot_accuracy_bars(df: pd.DataFrame, title: str) -> None:
     plt.grid(True, axis="y", alpha=0.3)
     plt.legend()
     plt.tight_layout()
-    plt.show(block=False)
     plt.savefig(f"figures/" + title + ".png", dpi=300)
+    plt.show(block=False)
+    plt.pause(2)
     plt.close()
@@ -404,8 +407,9 @@ def plot_confusion(y_true: np.ndarray, y_pred: np.ndarray, title: str) -> None:
     disp.plot(ax=ax, cmap="Blues", colorbar=True)
     ax.set_title(title)
     plt.tight_layout()
-    plt.show(block=False)
     plt.savefig(f"figures/" + title + ".png", dpi=300)
+    plt.show(block=False)
+    plt.pause(2)
     plt.close()
@@ -457,7 +461,7 @@ def plot_pca_scatter_2d(
         bbox_inches="tight",
     )
     plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
     plt.close()
@@ -497,7 +501,7 @@ def plot_feature_separability(
         bbox_inches="tight",
     )
     plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
     plt.close()
 
     # Plot worst
@@ -514,7 +518,7 @@
         bbox_inches="tight",
     )
     plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
     plt.close()
 
     return best_idx, worst_idx
@@ -554,7 +558,7 @@ def plot_feature_distributions_grid(
         bbox_inches="tight",
     )
     plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
     plt.close()
@@ -711,6 +715,16 @@ def final_training_for_all_best_configs(
         model = train_classifier(X_tr_p, y_tr, model_spec)
         y_val_pred = model.predict(X_val_p).astype(int)
 
+        # --- console output: confusion matrix + report ---
+        cm = confusion_matrix(y_val, y_val_pred)
+        print("\n" + "=" * 60)
+        print(f"[FINAL - VALIDATION] {preprocess_name} + {model_key}")
+        print("Confusion matrix:")
+        print(cm)
+        print("\nClassification report:")
+        print(classification_report(y_val, y_val_pred))
+        print("=" * 60)
+
         plot_confusion(
             y_val,
             y_val_pred,
@@ -776,12 +790,12 @@ def train_final_and_predict(
 # --------------------------------------------------
 # Helpers
 # --------------------------------------------------
-def effect_size_per_feature(X2: np.ndarray, X5: np.ndarray, eps: float = 1e-12) -> np.ndarray:
+def effect_size_per_feature(Xa: np.ndarray, Xb: np.ndarray, eps: float = 1e-12) -> np.ndarray:
     """
     Computes a simple per-feature separability score between two classes.
 
     Score (Cohen-like d):
-        d_j = |mu2 - mu5| / sqrt( (var2 + var5)/2 )
+        d_j = |mu_a - mu_b| / sqrt( (var_a + var_b)/2 )
 
     Larger d => better separation (less overlap).
     Smaller d => stronger overlap.
@@ -791,14 +805,14 @@ def effect_size_per_feature(X2: np.ndarray, X5: np.ndarray, eps: float = 1e-12)
     d : ndarray, shape (D,)
         Per-feature separability scores.
     """
-    mu2 = np.mean(X2, axis=0)
-    mu5 = np.mean(X5, axis=0)
+    mu_a = np.mean(Xa, axis=0)
+    mu_b = np.mean(Xb, axis=0)
 
-    var2 = np.var(X2, axis=0)
-    var5 = np.var(X5, axis=0)
+    var_a = np.var(Xa, axis=0)
+    var_b = np.var(Xb, axis=0)
 
-    pooled = np.sqrt(0.5 * (var2 + var5) + eps)
-    d = np.abs(mu2 - mu5) / pooled
+    pooled = np.sqrt(0.5 * (var_a + var_b) + eps)
+    d = np.abs(mu_a - mu_b) / pooled
 
     return d
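Review note: a quick numeric check of the score above, with toy data whose true means are one pooled standard deviation apart in feature 0 (all values are illustrative only):

    import numpy as np

    rng = np.random.default_rng(0)
    Xa = rng.normal([0.0, 0.0, 0.0], 1.0, size=(500, 3))  # class a: means 0, unit variance
    Xb = rng.normal([1.0, 0.3, 0.0], 1.0, size=(500, 3))  # class b: shifted means

    # Same formula as effect_size_per_feature: d_j = |mu_a - mu_b| / sqrt((var_a + var_b)/2)
    pooled = np.sqrt(0.5 * (Xa.var(axis=0) + Xb.var(axis=0)) + 1e-12)
    d = np.abs(Xa.mean(axis=0) - Xb.mean(axis=0)) / pooled
    print(d)  # roughly [1.0, 0.3, 0.0]: feature 0 separates best, feature 2 barely at all

This is presumably the ranking behind the best/worst panels of plot_feature_separability.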
""" - mu2 = np.mean(X2, axis=0) - mu5 = np.mean(X5, axis=0) + mu_a = np.mean(Xa, axis=0) + mu_b = np.mean(Xb, axis=0) - var2 = np.var(X2, axis=0) - var5 = np.var(X5, axis=0) + var_a = np.var(Xa, axis=0) + var_b = np.var(Xb, axis=0) - pooled = np.sqrt(0.5 * (var2 + var5) + eps) - d = np.abs(mu2 - mu5) / pooled + pooled = np.sqrt(0.5 * (var_a + var_b) + eps) + d = np.abs(mu_a - mu_b) / pooled return d @@ -814,8 +828,6 @@ def expand_param_grid(param_grid: Dict[str, List[Any]]) -> List[Dict[str, Any]]: return combos -from sklearn.model_selection import StratifiedKFold - def stratified_kfold_indices(y: np.ndarray, n_splits: int, seed: int = 0): skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) return list(skf.split(np.zeros_like(y), y)) @@ -1184,52 +1196,93 @@ TUNING_SPECS = [ # "preprocess": PREPROCESS_SPECS["scale"], # "model": "rf", # "param_grid": { - # "n_estimators": [400, 800, 1200, 1400], #[200, 400, 800], - # "max_depth": [None], #[None, 20, 40, 80], - # "max_features": ["sqrt"], #["sqrt", "log2", 0.5], - # "min_samples_split": [2, 4, 8, 10],#[2, 5, 10], - # "min_samples_leaf": [1, 2, 4], #[1, 2, 4], + # # Tuned values + # "n_estimators": [400], + # "max_depth": [None], + # "max_features": ["sqrt"], + # "min_samples_split": [4], + # "min_samples_leaf": [1], + # # + # # Tuned with the values below + # # Note: + # # Uncomment the following if you want to run the entire tuning process again! + # # ** Take a LOT of time ** + # # "n_estimators": [200, 400, 800, 1200. 1400], + # # "max_depth": [None, 20, 40, 80], + # # "max_features": ["sqrt", "log2", 0.5], + # # "min_samples_split": [2, 4, 5, 8, 10], + # # "min_samples_leaf": [1, 2, 4], + # }, + # "cv": 5, + # }, + # { + # "name": "scale + mlp", + # "preprocess_name": "scale", + # "preprocess": PREPROCESS_SPECS["scale"], + # "model": "mlp", + # "param_grid": { + # # Tuned values + # "hidden_layer_sizes": [(128,)], + # "alpha": [0.001], + # "learning_rate_init": [0.01], + # "activation": ["relu"], + # "solver": ["adam"], + # # + # # Tuned with the values below + # # Note: + # # Uncomment the following if you want to run the entire tuning process again! + # # ** Take a LOT of time ** + # # "hidden_layer_sizes": [(128, ), (128, 64), (256, 128), (128, 64, 32)], + # # "alpha": [1e-5, 1e-4, 1e-3, 0.01], + # # "learning_rate_init": [1e-4, 1e-3, 0.01, 0.02], + # # "activation": ["relu", "tanh"], + # # # "max_iter": [2000], + # # "solver": ["adam", "sgd"], + # }, + # "cv": 5, + # }, + # { + # "name": "scale_pca_85 + knn", + # "preprocess_name": "scale_pca_85", + # "preprocess": PREPROCESS_SPECS["scale_pca_85"], + # "model": "knn", + # "param_grid": { + # # Tuned values + # "n_neighbors": [9], + # "weights": ["distance"], + # "p": [2], + # # + # # Tuned with the values below + # # Note: + # # Uncomment the following if you want to run the entire tuning process again! 
+#             # ** Takes a LOT of time **
+#             # "n_neighbors": [5, 7, 8, 9, 10, 11, 15, 31, 42],
+#             # "weights": ["uniform", "distance"],
+#             # "p": [1, 2],
 #         },
 #         "cv": 5,
 #     },
-    {
-        "name": "scale + mlp",
-        "preprocess_name": "scale",
-        "preprocess": PREPROCESS_SPECS["scale"],
-        "model": "mlp",
-        "param_grid": {
-            "hidden_layer_sizes": [(128, ), (128, 64), (256, 128), (128, 64, 32)],
-            "alpha": [1e-5, 1e-4, 1e-3],
-            "learning_rate_init": [1e-3, 0.01, 0.02],
-            "activation": ["relu"], #["relu", "tanh"],
-            # "max_iter": [2000],
-            "solver": ["adam"], #["adam", "sgd"],
-        },
-        "cv": 5,
-    },
-    {
-        "name": "scale_pca_85 + knn",
-        "preprocess_name": "scale_pca_85",
-        "preprocess": PREPROCESS_SPECS["scale_pca_85"],
-        "model": "knn",
-        "param_grid": {
-            "n_neighbors": [7, 8, 9, 10, 11, 15, 31, 42],
-            "weights": ["uniform", "distance"],
-            "p": [1, 2],
-        },
-        "cv": 5,
-    },
     {
         "name": "scale + svm",
         "preprocess_name": "scale",
         "preprocess": PREPROCESS_SPECS["scale"],
         "model": "svm",
         "param_grid": {
-            "kernel": ["rbf", "poly"],
-            "C": [3, 4, 5, 5.5, 6, 10],
-            "degree": [2, 3, 5],
-            "gamma": ["scale", "auto"],
+            # Tuned values
+            "kernel": ["rbf"],
+            "C": [4],
+            "gamma": ["scale"],
             "class_weight": [None],
+            #
+            # Tuned with the values below
+            # Note:
+            # Uncomment the following if you want to run the entire tuning process again!
+            # ** Takes a LOT of time **
+            # "kernel": ["rbf", "poly"],
+            # "C": [0.1, 0.3, 1, 3, 4, 5, 5.5, 6, 10, 30],
+            # # "degree": [2, 3, 5], (only for "poly")
+            # "gamma": ["scale", "auto", 0.1, 0.03, 0.01, 0.003, 0.001],
+            # "class_weight": [None, "balanced"],
         },
         "cv": 5,
     },
@@ -1262,7 +1315,7 @@ if __name__ == "__main__":
         # Phase 1.2: visualization
         visualization_phase(results, df)
 
-        # Phase 1,3: problem demo
+        # Phase 1.3: problem demo
         problem_demonstration_phase(X_train_raw, y_train, class_a=2, class_b=5, top_k=9)
 
     if param == "phase2" or param == "all":
@@ -1284,7 +1337,7 @@ if __name__ == "__main__":
             seed=0,
         )
 
-        # (Optional) also train/predict only for the best overall and save as the official submission file
+        # Also train/predict only for the best overall and save as the official submission file
        y_test_pred = train_final_and_predict(
             X_train_raw, y_train, X_test_raw, best_overall, labels_path="labelsX.npy"
         )
diff --git a/src/run1.zip b/src/run1.zip
new file mode 100644
index 0000000..eded5e3
Binary files /dev/null and b/src/run1.zip differ
diff --git a/src/run2.zip b/src/run2.zip
new file mode 100644
index 0000000..94e3253
Binary files /dev/null and b/src/run2.zip differ
diff --git a/src/run3.zip b/src/run3.zip
new file mode 100644
index 0000000..bae020f
Binary files /dev/null and b/src/run3.zip differ
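Review note: the [TUNING] lines in output.txt come from pairing expand_param_grid with stratified_kfold_indices. The actual driver function is not part of this diff, so the following is only a sketch of that pattern under stated assumptions (build_model, X, y are placeholders; in partD.py the preprocessing is also fit inside each fold to avoid leakage):

    import numpy as np
    from itertools import product

    def run_grid(build_model, param_grid, X, y, folds):
        """Score every parameter combination on the given CV folds."""
        keys = list(param_grid)
        results = []
        for values in product(*param_grid.values()):
            params = dict(zip(keys, values))
            accs = []
            for tr_idx, val_idx in folds:
                model = build_model(**params)           # fresh estimator per fold
                model.fit(X[tr_idx], y[tr_idx])
                accs.append(model.score(X[val_idx], y[val_idx]))
            results.append((params, float(np.mean(accs)), float(np.std(accs))))
        return max(results, key=lambda r: r[1])          # best by mean accuracy

    # e.g. folds = stratified_kfold_indices(y_train, n_splits=5) with
    # build_model = lambda **p: SVC(**p) yields mean/std pairs in the style of
    # the "scale + svm" line above (0.8582, std 0.0133).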