Compare commits


No commits in common. "2c287d819d3f001b622d95a4a425b8d63ec6e5d7" and "7af1d66a8f42b3eb663cf38cd7263b6d7a842023" have entirely different histories.

33 changed files with 55 additions and 308 deletions

2 file diffs suppressed because one or more lines are too long.

4 binary files not shown.

14 binary image files (figures) deleted; previews not shown (sizes 21 KiB – 6.0 MiB).

View File

@@ -1,2 +0,0 @@
config,preprocess,model,params,mean_acc,std_acc
scale + svm,scale,svm,"{'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}",0.8581719138625145,0.013348223889216927
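
For reference, the deleted row above records the winning configuration. A minimal sketch of rebuilding it with scikit-learn, assuming standard-scaled features (X_train and y_train are placeholder names, not identifiers from this repo):

# Best configuration from the deleted CSV: StandardScaler + RBF SVM.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

best_pipeline = Pipeline([
    ("scale", StandardScaler()),
    ("svm", SVC(kernel="rbf", C=4, gamma="scale", class_weight=None)),
])
# best_pipeline.fit(X_train, y_train)  # recorded CV accuracy: 0.858 ± 0.013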

5 binary files not shown.

View File

@@ -1,185 +0,0 @@
/home/hoo2/Work/AUTh/PatternRecognition/Assignment_2025-26/.venv/bin/python /home/hoo2/Work/AUTh/PatternRecognition/Assignment_2025-26/src/partD.py all
[ scale] [ gnb] val_acc=0.7095
[ scale] [ rf] val_acc=0.8205
[ scale] [ logreg] val_acc=0.7730
[ scale] [linear_svm] val_acc=0.7707
[ scale] [ svm] val_acc=0.8593
[ scale] [ mlp] val_acc=0.8382
[ scale] [ knn] val_acc=0.8342
[ scale] [ adaboost] val_acc=0.6832
[scale_pca_66] [ gnb] val_acc=0.7524
[scale_pca_66] [ rf] val_acc=0.8096
[scale_pca_66] [ logreg] val_acc=0.7862
[scale_pca_66] [linear_svm] val_acc=0.7736
[scale_pca_66] [ svm] val_acc=0.8582
[scale_pca_66] [ mlp] val_acc=0.8359
[scale_pca_66] [ knn] val_acc=0.8370
[scale_pca_66] [ adaboost] val_acc=0.6878
[scale_pca_75] [ gnb] val_acc=0.7547
[scale_pca_75] [ rf] val_acc=0.8130
[scale_pca_75] [ logreg] val_acc=0.7839
[scale_pca_75] [linear_svm] val_acc=0.7696
[scale_pca_75] [ svm] val_acc=0.8565
[scale_pca_75] [ mlp] val_acc=0.8216
[scale_pca_75] [ knn] val_acc=0.8370
[scale_pca_75] [ adaboost] val_acc=0.6878
[scale_pca_85] [ gnb] val_acc=0.7501
[scale_pca_85] [ rf] val_acc=0.8033
[scale_pca_85] [ logreg] val_acc=0.7810
[scale_pca_85] [linear_svm] val_acc=0.7662
[scale_pca_85] [ svm] val_acc=0.8588
[scale_pca_85] [ mlp] val_acc=0.8188
[scale_pca_85] [ knn] val_acc=0.8388
[scale_pca_85] [ adaboost] val_acc=0.6998
=== Investigation summary ===
model
svm 0.859348
knn 0.838765
mlp 0.838193
rf 0.820469
logreg 0.786164
linear_svm 0.773585
gnb 0.754717
adaboost 0.699828
Selected top-3 models for further analysis: ['svm', 'knn', 'mlp']
Best configuration overall: preprocess=scale, model=svm, val_acc=0.8593
Classification report (best config):
precision recall f1-score support
1 0.94 0.96 0.95 354
2 0.76 0.73 0.75 344
3 0.92 0.93 0.93 351
4 0.91 0.91 0.91 343
5 0.75 0.77 0.76 357
accuracy 0.86 1749
macro avg 0.86 0.86 0.86 1749
weighted avg 0.86 0.86 0.86 1749
[TUNING] scale + rf (cv=5) ...
[scale | rf] combo 1/1 mean=0.8228 params={'n_estimators': 400, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}
best mean_acc=0.8228 (std=0.0121) params={'n_estimators': 400, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}
[TUNING] scale + mlp (cv=5) ...
[scale | mlp] combo 1/1 mean=0.8407 params={'hidden_layer_sizes': (128,), 'alpha': 0.001, 'learning_rate_init': 0.01, 'activation': 'relu', 'solver': 'adam'}
best mean_acc=0.8407 (std=0.0098) params={'hidden_layer_sizes': (128,), 'alpha': 0.001, 'learning_rate_init': 0.01, 'activation': 'relu', 'solver': 'adam'}
[TUNING] scale_pca_85 + knn (cv=5) ...
[scale_pca_85 | knn] combo 1/1 mean=0.8313 params={'n_neighbors': 9, 'weights': 'distance', 'p': 2}
best mean_acc=0.8313 (std=0.0117) params={'n_neighbors': 9, 'weights': 'distance', 'p': 2}
[TUNING] scale + svm (cv=5) ...
[scale | svm] combo 1/1 mean=0.8582 params={'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}
best mean_acc=0.8582 (std=0.0133) params={'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}
=== Tuning summary (best overall) ===
{'name': 'scale + svm', 'preprocess_spec': {'type': 'pipeline', 'steps': [{'type': 'scaler', 'params': {}}]}, 'preprocess_name': 'scale', 'model': 'svm', 'params': {'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}, 'mean_acc': 0.8581719138625145, 'std_acc': 0.013348223889216927}
============================================================
[FINAL - VALIDATION] scale + rf
Confusion matrix:
[[338 6 5 3 2]
[ 4 239 11 12 78]
[ 11 2 316 21 1]
[ 3 12 17 299 12]
[ 13 71 4 9 260]]
Classification report:
precision recall f1-score support
1 0.92 0.95 0.93 354
2 0.72 0.69 0.71 344
3 0.90 0.90 0.90 351
4 0.87 0.87 0.87 343
5 0.74 0.73 0.73 357
accuracy 0.83 1749
macro avg 0.83 0.83 0.83 1749
weighted avg 0.83 0.83 0.83 1749
============================================================
[FINAL] scale_rf: saved labelsX_scale_rf.npy shape=(6955,)
============================================================
[FINAL - VALIDATION] scale + mlp
Confusion matrix:
[[338 1 9 2 4]
[ 5 244 13 7 75]
[ 10 3 320 16 2]
[ 0 14 16 302 11]
[ 8 74 1 16 258]]
Classification report:
precision recall f1-score support
1 0.94 0.95 0.95 354
2 0.73 0.71 0.72 344
3 0.89 0.91 0.90 351
4 0.88 0.88 0.88 343
5 0.74 0.72 0.73 357
accuracy 0.84 1749
macro avg 0.83 0.84 0.83 1749
weighted avg 0.83 0.84 0.84 1749
============================================================
[FINAL] scale_mlp: saved labelsX_scale_mlp.npy shape=(6955,)
============================================================
[FINAL - VALIDATION] scale_pca_85 + knn
Confusion matrix:
[[346 2 5 0 1]
[ 5 193 9 7 130]
[ 19 1 319 11 1]
[ 4 9 17 301 12]
[ 8 33 1 6 309]]
Classification report:
precision recall f1-score support
1 0.91 0.98 0.94 354
2 0.81 0.56 0.66 344
3 0.91 0.91 0.91 351
4 0.93 0.88 0.90 343
5 0.68 0.87 0.76 357
accuracy 0.84 1749
macro avg 0.85 0.84 0.84 1749
weighted avg 0.85 0.84 0.84 1749
============================================================
[FINAL] scale_pca_85_knn: saved labelsX_scale_pca_85_knn.npy shape=(6955,)
============================================================
[FINAL - VALIDATION] scale + svm
Confusion matrix:
[[340 2 8 1 3]
[ 3 251 9 6 75]
[ 7 1 327 14 2]
[ 0 12 9 311 11]
[ 11 63 1 8 274]]
Classification report:
precision recall f1-score support
1 0.94 0.96 0.95 354
2 0.76 0.73 0.75 344
3 0.92 0.93 0.93 351
4 0.91 0.91 0.91 343
5 0.75 0.77 0.76 357
accuracy 0.86 1749
macro avg 0.86 0.86 0.86 1749
weighted avg 0.86 0.86 0.86 1749
============================================================
[FINAL] scale_svm: saved labelsX_scale_svm.npy shape=(6955,)
Saved labels to labelsX.npy with shape (6955,)
Process finished with exit code 0
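
The "[TUNING] ... mean=... (std=...)" lines in this log come from stratified 5-fold cross-validation. A sketch of how one such number could be reproduced, assuming accuracy scoring (X and y stand in for the assignment's training data):

# Score one preprocess+model combo the way the log reports it.
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([("scale", StandardScaler()),
                 ("svm", SVC(kernel="rbf", C=4, gamma="scale"))])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy")
# print(f"mean={scores.mean():.4f} (std={scores.std():.4f})")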

View File

@@ -197,17 +197,6 @@ def plot_gaussians_3d(
ax.set_zlabel("pdf")
plt.show()
# plt.figure(figsize=(6, 5))
# plt.scatter(X[:, 0], X[:, 1], s=10, alpha=0.35)
# plt.contour(Xgrid, Ygrid, Z, levels=8, linewidths=1.5)
#
# plt.title("Estimated Gaussian density (ML)")
# plt.xlabel("x₁")
# plt.ylabel("x₂")
#
# plt.tight_layout()
# plt.show()
# --------------------------------------------------
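
The deleted comment block above sketched a 2D contour view of the estimated density. A runnable version of that idea on synthetic data, reusing the Xgrid/Ygrid/Z names from the removed comments (the toy data and the ML-style estimates are illustrative):

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

rng = np.random.default_rng(0)
X = rng.multivariate_normal([0, 0], [[1.0, 0.4], [0.4, 0.8]], size=500)
mu, cov = X.mean(axis=0), np.cov(X, rowvar=False)  # ML-style estimates

xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
Xgrid, Ygrid = np.meshgrid(xs, ys)
Z = multivariate_normal(mu, cov).pdf(np.dstack((Xgrid, Ygrid)))

plt.figure(figsize=(6, 5))
plt.scatter(X[:, 0], X[:, 1], s=10, alpha=0.35)
plt.contour(Xgrid, Ygrid, Z, levels=8, linewidths=1.5)
plt.title("Estimated Gaussian density (ML)")
plt.xlabel("x₁")
plt.ylabel("x₂")
plt.tight_layout()
plt.show()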

View File

@@ -302,7 +302,7 @@ def plot_histogram_with_pdf(
plt.plot(x_plot, pdf_true, label=f"True N({mu_true}, {var_true}) pdf")
plt.xlabel("x")
plt.ylabel("Density")
plt.title(f"Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
plt.title("Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
plt.legend()
plt.grid(True)
plt.show()
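
Note on this hunk: the replacement line drops the f prefix, so {mu_true} and {var_true} are no longer interpolated and appear literally in the title. A quick illustration:

mu_true, var_true = 0, 1
print(f"N({mu_true}, {var_true})")  # f-string  -> N(0, 1)
print("N({mu_true}, {var_true})")   # plain str -> N({mu_true}, {var_true})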

View File

@@ -33,8 +33,6 @@ import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
@@ -391,9 +389,8 @@ def plot_accuracy_bars(df: pd.DataFrame, title: str) -> None:
plt.grid(True, axis="y", alpha=0.3)
plt.legend()
plt.tight_layout()
plt.savefig(f"figures/" + title + ".png", dpi=300)
plt.show(block=False)
plt.pause(2)
plt.savefig(f"figures/" + title + ".png", dpi=300)
plt.close()
@@ -407,9 +404,8 @@ def plot_confusion(y_true: np.ndarray, y_pred: np.ndarray, title: str) -> None:
disp.plot(ax=ax, cmap="Blues", colorbar=True)
ax.set_title(title)
plt.tight_layout()
plt.savefig(f"figures/" + title + ".png", dpi=300)
plt.show(block=False)
plt.pause(2)
plt.savefig(f"figures/" + title + ".png", dpi=300)
plt.close()
@@ -461,7 +457,7 @@ def plot_pca_scatter_2d(
bbox_inches="tight",
)
plt.show(block=False)
plt.pause(2)
plt.pause(0.001)
plt.close()
@@ -501,7 +497,7 @@ def plot_feature_separability(
bbox_inches="tight",
)
plt.show(block=False)
plt.pause(2)
plt.pause(0.001)
plt.close()
# Plot worst
@@ -518,7 +514,7 @@ def plot_feature_separability(
bbox_inches="tight",
)
plt.show(block=False)
plt.pause(2)
plt.pause(0.001)
plt.close()
return best_idx, worst_idx
@@ -558,7 +554,7 @@ def plot_feature_distributions_grid(
bbox_inches="tight",
)
plt.show(block=False)
plt.pause(2)
plt.pause(0.001)
plt.close()
@@ -715,16 +711,6 @@ def final_training_for_all_best_configs(
model = train_classifier(X_tr_p, y_tr, model_spec)
y_val_pred = model.predict(X_val_p).astype(int)
# --- console output: confusion matrix + report ---
cm = confusion_matrix(y_val, y_val_pred)
print("\n" + "=" * 60)
print(f"[FINAL - VALIDATION] {preprocess_name} + {model_key}")
print("Confusion matrix:")
print(cm)
print("\nClassification report:")
print(classification_report(y_val, y_val_pred))
print("=" * 60)
plot_confusion(
y_val,
y_val_pred,
@@ -790,12 +776,12 @@ def train_final_and_predict(
# --------------------------------------------------
# Helpers
# --------------------------------------------------
def effect_size_per_feature(Xa: np.ndarray, Xb: np.ndarray, eps: float = 1e-12) -> np.ndarray:
def effect_size_per_feature(X2: np.ndarray, X5: np.ndarray, eps: float = 1e-12) -> np.ndarray:
"""
Computes a simple per-feature separability score between two classes.
Score (Cohen-like d):
d_j = |mu_a - mu_b| / sqrt( (var_a + var_b)/2 )
d_j = |mu2 - mu5| / sqrt( (var2 + var5)/2 )
Larger d => better separation (less overlap).
Smaller d => stronger overlap.
@@ -805,14 +791,14 @@ def effect_size_per_feature(Xa: np.ndarray, Xb: np.ndarray, eps: float = 1e-12)
d : ndarray, shape (D,)
Per-feature separability scores.
"""
mu_a = np.mean(Xa, axis=0)
mu_b = np.mean(Xb, axis=0)
mu2 = np.mean(X2, axis=0)
mu5 = np.mean(X5, axis=0)
var_a = np.var(Xa, axis=0)
var_b = np.var(Xb, axis=0)
var2 = np.var(X2, axis=0)
var5 = np.var(X5, axis=0)
pooled = np.sqrt(0.5 * (var_a + var_b) + eps)
d = np.abs(mu_a - mu_b) / pooled
pooled = np.sqrt(0.5 * (var2 + var5) + eps)
d = np.abs(mu2 - mu5) / pooled
return d
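
A tiny worked example of the Cohen-like d above, computed inline on synthetic class-2/class-5 data (the toy arrays are illustrative):

import numpy as np

rng = np.random.default_rng(0)
X2 = rng.normal(loc=[0.0, 0.0], scale=1.0, size=(200, 2))  # "class 2"
X5 = rng.normal(loc=[2.0, 0.0], scale=1.0, size=(200, 2))  # "class 5"

mu2, mu5 = X2.mean(axis=0), X5.mean(axis=0)
var2, var5 = X2.var(axis=0), X5.var(axis=0)
d = np.abs(mu2 - mu5) / np.sqrt(0.5 * (var2 + var5) + 1e-12)
print(d)  # d[0] ≈ 2 (well separated), d[1] ≈ 0 (strong overlap)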
@@ -828,6 +814,8 @@ def expand_param_grid(param_grid: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
return combos
from sklearn.model_selection import StratifiedKFold
def stratified_kfold_indices(y: np.ndarray, n_splits: int, seed: int = 0):
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
return list(skf.split(np.zeros_like(y), y))
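
Usage sketch for the helper above: each (train, validation) index pair preserves the class proportions of y (the toy labels are illustrative):

import numpy as np
from sklearn.model_selection import StratifiedKFold

def stratified_kfold_indices(y, n_splits, seed=0):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return list(skf.split(np.zeros_like(y), y))

y = np.array([1, 1, 2, 2, 5, 5, 1, 2, 5, 1, 2, 5])
for fold, (tr, va) in enumerate(stratified_kfold_indices(y, n_splits=3)):
    print(f"fold {fold}: train={tr}, val={va}")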
@@ -1196,93 +1184,52 @@ TUNING_SPECS = [
# "preprocess": PREPROCESS_SPECS["scale"],
# "model": "rf",
# "param_grid": {
# # Tuned values
# "n_estimators": [400],
# "max_depth": [None],
# "max_features": ["sqrt"],
# "min_samples_split": [4],
# "min_samples_leaf": [1],
# #
# # Tuned with the values below
# # Note:
# # Uncomment the following if you want to run the entire tuning process again!
# # ** Take a LOT of time **
# # "n_estimators": [200, 400, 800, 1200. 1400],
# # "max_depth": [None, 20, 40, 80],
# # "max_features": ["sqrt", "log2", 0.5],
# # "min_samples_split": [2, 4, 5, 8, 10],
# # "min_samples_leaf": [1, 2, 4],
# },
# "cv": 5,
# },
# {
# "name": "scale + mlp",
# "preprocess_name": "scale",
# "preprocess": PREPROCESS_SPECS["scale"],
# "model": "mlp",
# "param_grid": {
# # Tuned values
# "hidden_layer_sizes": [(128,)],
# "alpha": [0.001],
# "learning_rate_init": [0.01],
# "activation": ["relu"],
# "solver": ["adam"],
# #
# # Tuned with the values below
# # Note:
# # Uncomment the following if you want to run the entire tuning process again!
# # ** Take a LOT of time **
# # "hidden_layer_sizes": [(128, ), (128, 64), (256, 128), (128, 64, 32)],
# # "alpha": [1e-5, 1e-4, 1e-3, 0.01],
# # "learning_rate_init": [1e-4, 1e-3, 0.01, 0.02],
# # "activation": ["relu", "tanh"],
# # # "max_iter": [2000],
# # "solver": ["adam", "sgd"],
# },
# "cv": 5,
# },
# {
# "name": "scale_pca_85 + knn",
# "preprocess_name": "scale_pca_85",
# "preprocess": PREPROCESS_SPECS["scale_pca_85"],
# "model": "knn",
# "param_grid": {
# # Tuned values
# "n_neighbors": [9],
# "weights": ["distance"],
# "p": [2],
# #
# # Tuned with the values below
# # Note:
# # Uncomment the following if you want to run the entire tuning process again!
# # ** Take a LOT of time **
# # "n_neighbors": [5, 7, 8, 9, 10, 11, 15, 31, 42],
# # "weights": ["uniform", "distance"],
# # "p": [1, 2],
# "n_estimators": [400, 800, 1200, 1400], #[200, 400, 800],
# "max_depth": [None], #[None, 20, 40, 80],
# "max_features": ["sqrt"], #["sqrt", "log2", 0.5],
# "min_samples_split": [2, 4, 8, 10],#[2, 5, 10],
# "min_samples_leaf": [1, 2, 4], #[1, 2, 4],
# },
# "cv": 5,
# },
{
"name": "scale + mlp",
"preprocess_name": "scale",
"preprocess": PREPROCESS_SPECS["scale"],
"model": "mlp",
"param_grid": {
"hidden_layer_sizes": [(128, ), (128, 64), (256, 128), (128, 64, 32)],
"alpha": [1e-5, 1e-4, 1e-3],
"learning_rate_init": [1e-3, 0.01, 0.02],
"activation": ["relu"], #["relu", "tanh"],
# "max_iter": [2000],
"solver": ["adam"], #["adam", "sgd"],
},
"cv": 5,
},
{
"name": "scale_pca_85 + knn",
"preprocess_name": "scale_pca_85",
"preprocess": PREPROCESS_SPECS["scale_pca_85"],
"model": "knn",
"param_grid": {
"n_neighbors": [7, 8, 9, 10, 11, 15, 31, 42],
"weights": ["uniform", "distance"],
"p": [1, 2],
},
"cv": 5,
},
{
"name": "scale + svm",
"preprocess_name": "scale",
"preprocess": PREPROCESS_SPECS["scale"],
"model": "svm",
"param_grid": {
# Tuned values
"kernel": ["rbf"],
"C": [4],
"gamma": ["scale"],
"kernel": ["rbf", "poly"],
"C": [3, 4, 5, 5.5, 6, 10],
"degree": [2, 3, 5],
"gamma": ["scale", "auto"],
"class_weight": [None],
#
# Tuned with the values below
# Note:
# Uncomment the following if you want to run the entire tuning process again!
# ** Take a LOT of time **
# "kernel": ["rbf", "poly"],
# "C": [0.1, 0.3, 1, 3, 4, 5, 5.5, 6, 10, 30],
# # "degree": [2, 3, 5], (only for "poly")
# "gamma": ["scale", "auto", 0.1, 0.03, 0.01, 0.003, 0.001],
# "class_weight": [None, "balanced"],
},
"cv": 5,
},
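
The rbf/poly grid above expands to 2 × 6 × 3 × 2 × 1 = 72 combinations (sklearn's SVC ignores degree unless kernel="poly"). A sketch of the Cartesian-product expansion that expand_param_grid presumably performs, given its name and how it is used:

from itertools import product

param_grid = {
    "kernel": ["rbf", "poly"],
    "C": [3, 4, 5, 5.5, 6, 10],
    "degree": [2, 3, 5],
    "gamma": ["scale", "auto"],
    "class_weight": [None],
}
keys = list(param_grid)
combos = [dict(zip(keys, vals)) for vals in product(*param_grid.values())]
print(len(combos))  # 72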
@@ -1315,7 +1262,7 @@ if __name__ == "__main__":
# Phase 1.2: visualization
visualization_phase(results, df)
# Phase 1.3: problem demo
# Phase 1,3: problem demo
problem_demonstration_phase(X_train_raw, y_train, class_a=2, class_b=5, top_k=9)
if param == "phase2" or param == "all":
@@ -1337,7 +1284,7 @@
seed=0,
)
# Also train/predict only for the best overall and save as the official submission file
# (Optional) also train/predict only for the best overall and save as the official submission file
y_test_pred = train_final_and_predict(
X_train_raw, y_train, X_test_raw, best_overall, labels_path="labelsX.npy"
)

3 binary files not shown.