In [1]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # 0=all, 1=no INFO, 2=no INFO/WARN, 3=no INFO/WARN/ERROR
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
import re
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

import tensorflow as tf
tf.get_logger().setLevel("ERROR")
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
# -----------------------------
# 1) Robust file loading
# -----------------------------
def load_customer_marketing(path: str) -> pd.DataFrame:
    """
    Handles 3 common formats:
    A) Proper CSV with commas
    B) Proper TSV with real tab separators
    C) Broken 'CSV' where tabs are stored as literal '\\t' characters
    """
    # First try normal comma-separated
    df = pd.read_csv(path)
    if df.shape[1] > 1:
        return df

    # If only one column and header contains tab markers, parse manually
    colname = df.columns[0]
    s = df.iloc[:, 0].astype(str)

    # Case C: literal "\t" sequences
    if "\\t" in colname or s.str.contains(r"\\t").any():
        header = colname.split("\\t")
        rows = s.str.split(r"\\t", expand=True)
        rows = rows.iloc[:, :len(header)]
        rows.columns = header
        return rows

    # Case B: real TSV but pandas didn't detect earlier (rare in your situation)
    df2 = pd.read_csv(path, sep="\t", engine="python")
    if df2.shape[1] > 1:
        return df2

    raise ValueError(
        "Could not parse the file into columns. "
        "Open the file and check whether separators are commas, real tabs, or literal \\t."
    )



In [3]:
# -----------------------------
# 2) Feature preparation
# -----------------------------
def build_preprocessor(df: pd.DataFrame):
    # Kaggle-like columns (some datasets use slightly different names; keep flexible)
    # We'll not rely on exact list; instead infer numeric vs categorical after type conversion.
    # First, try convert obvious numeric-looking columns.
    df2 = df.copy()

    # convert numeric-like strings to numbers when possible
    for c in df2.columns:
        # do not convert dates here
        if c.lower() in ["dt_customer", "dt_customer\r", "dt_customer\n"]:
            continue
        df2[c] = pd.to_numeric(df2[c], errors="ignore")

    # parse date if exists
    if "Dt_Customer" in df2.columns:
        dt = pd.to_datetime(df2["Dt_Customer"], errors="coerce", dayfirst=False)
        # derive numeric feature "Customer_Tenure_Days"
        df2["Customer_Tenure_Days"] = (dt.max() - dt).dt.days
        # drop original date for modeling
        df2 = df2.drop(columns=["Dt_Customer"])

    # drop ID if exists
    for id_col in ["ID", "Id", "id"]:
        if id_col in df2.columns:
            df2 = df2.drop(columns=[id_col])

    # Determine columns
    numeric_cols = df2.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [c for c in df2.columns if c not in numeric_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]), numeric_cols),
            ("cat", Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
            ]), categorical_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False
    )

    return df2, preprocessor, numeric_cols, categorical_cols


In [4]:
# -----------------------------
# 3) Autoencoder
# -----------------------------
def build_autoencoder(input_dim: int, latent_dim: int) -> tuple[Model, Model]:
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation="relu")(inp)
    x = layers.Dense(64, activation="relu")(x)
    z = layers.Dense(latent_dim, activation="linear", name="latent")(x)

    x = layers.Dense(64, activation="relu")(z)
    x = layers.Dense(128, activation="relu")(x)
    out = layers.Dense(input_dim, activation="linear")(x)

    auto = Model(inp, out)
    enc = Model(inp, z)
    auto.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")
    return auto, enc


def pick_best_k_by_silhouette(X, k_min=2, k_max=12, random_state=42):
    best = {"k": None, "sil": -np.inf}
    scores = {}
    for k in range(k_min, k_max + 1):
        km = KMeans(n_clusters=k, random_state=random_state, n_init=20)
        labels = km.fit_predict(X)

        # silhouette requires at least 2 clusters in labels [web:13]
        if np.unique(labels).size < 2:
            scores[k] = np.nan
            continue

        sil = silhouette_score(X, labels)
        scores[k] = sil
        if sil > best["sil"]:
            best = {"k": k, "sil": sil}
    return best, scores


In [6]:
# -----------------------------
# 4) Main
# -----------------------------
def main():
    os.chdir("/home/konnilol/Documents/uni/mmo/pr7")
    path = "Customer-marketing.csv"
    df = load_customer_marketing(path)
    print("Parsed dataframe shape:", df.shape)
    print("Columns:", list(df.columns))

    df2, preprocessor, num_cols, cat_cols = build_preprocessor(df)
    print(f"Numeric cols: {len(num_cols)} | Categorical cols: {len(cat_cols)}")

    # Build X for AE path: needs scaling + one-hot inside preprocessor
    X_scaled = preprocessor.fit_transform(df2)
    print("X_scaled shape:", X_scaled.shape)

    # ----- Step 1: AE + KMeans + best k
    latent_dim = max(2, min(16, X_scaled.shape[1] // 6))
    auto, enc = build_autoencoder(X_scaled.shape[1], latent_dim)

    es = EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True)
    auto.fit(
        X_scaled, X_scaled,
        epochs=200,
        batch_size=256,
        validation_split=0.2,
        callbacks=[es],
        verbose=0
    )

    Z = enc.predict(X_scaled, verbose=0)
    best_ae, ae_scores = pick_best_k_by_silhouette(Z, k_min=2, k_max=12)
    if best_ae["k"] is None:
        raise RuntimeError("AE embedding clustering failed: silhouette couldn't be computed for any k.")
    k_star = best_ae["k"]
    print("\n[Step1] AE embedding best k:", k_star, "silhouette:", round(best_ae["sil"], 4))

    km_ae = KMeans(n_clusters=k_star, random_state=42, n_init=20)
    labels_ae = km_ae.fit_predict(Z)

    # ----- Step 2: same method+same k, but WITHOUT AE and WITHOUT normalization
    # Here we must build a raw numeric matrix. For fairness with 'no normalization',
    # we will: numeric -> median impute, categorical -> one-hot (still needed for KMeans).
    # Crucially: no scaling on numeric features.
    num_pipe_raw = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # no scaler here by requirement
    ])
    cat_pipe_raw = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    preprocessor_raw = ColumnTransformer(
        transformers=[
            ("num", num_pipe_raw, num_cols),
            ("cat", cat_pipe_raw, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False
    )

    X_raw = preprocessor_raw.fit_transform(df2)
    print("\nX_raw (no scaling) shape:", X_raw.shape)

    km_raw = KMeans(n_clusters=k_star, random_state=42, n_init=20)
    labels_raw = km_raw.fit_predict(X_raw)

    # Evaluate both with silhouette in their own spaces
    sil_ae = silhouette_score(Z, labels_ae) if np.unique(labels_ae).size > 1 else np.nan
    sil_raw = silhouette_score(X_raw, labels_raw) if np.unique(labels_raw).size > 1 else np.nan
    print("\nSilhouette comparison:")
    print("  AE+KMeans:", sil_ae)
    print("  Raw KMeans:", sil_raw)

    winner = "AE+embedding" if (np.nan_to_num(sil_ae, nan=-np.inf) > np.nan_to_num(sil_raw, nan=-np.inf)) else "Raw"
    print("\nWinner:", winner)

    # Attach winner labels and produce simple, interpretable profiles on ORIGINAL numeric cols
    out = df2.copy()
    out["cluster_ae"] = labels_ae
    out["cluster_raw"] = labels_raw
    out.to_csv("clusters_with_labels.csv", index=False)
    print("\nSaved: clusters_with_labels.csv")

    # Meaningful cluster stats (only numeric original columns for interpretability)
    if len(num_cols) > 0:
        print("\nCluster profiles (AE winner labels shown as cluster_ae):")
        print(out.groupby("cluster_ae")[num_cols].mean().round(2))

        print("\nCluster profiles (Raw labels shown as cluster_raw):")
        print(out.groupby("cluster_raw")[num_cols].mean().round(2))

    # Optional 2D plot via PCA (works even if many features)
    pca = PCA(n_components=2, random_state=42)
    Z2 = pca.fit_transform(Z)
    df_plot = pd.DataFrame({"x": Z2[:, 0], "y": Z2[:, 1], "cluster": labels_ae})
    df_plot.to_csv("ae_embedding_pca2.csv", index=False)
    print("Saved: ae_embedding_pca2.csv")


if __name__ == "__main__":
    main()


Parsed dataframe shape: (2240, 29)
Columns: ['ID_', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response']
Numeric cols: 27 | Categorical cols: 2
X_scaled shape: (2240, 40)


I0000 00:00:1766510739.934508   56051 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1061 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:06:00.0, compute capability: 8.6
I0000 00:00:1766510747.257287   57175 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



[Step1] AE embedding best k: 7 silhouette: 0.3907

X_raw (no scaling) shape: (2240, 40)

Silhouette comparison:
  AE+KMeans: 0.39065060019493103
  Raw KMeans: 0.44262692517579105

Winner: Raw

Saved: clusters_with_labels.csv

Cluster profiles (AE winner labels shown as cluster_ae):
                ID_  Year_Birth    Income  Kidhome  Teenhome  Recency  \
cluster_ae                                                              
0           5637.38     1970.20  38763.88     0.71      0.55    49.13   
1           5442.50     1968.01  78309.64     0.09      0.31    50.00   
2           5592.78     1968.03  84201.74     0.03      0.09    47.46   
3           5726.17     1966.97  68758.84     0.05      0.48    50.26   
4           4968.38     1969.25  44719.03     0.56      0.57    43.36   
5           6722.71     1965.10  45242.29     0.67      0.52    53.05   
6           5104.53     1964.73  60154.11     0.23      0.82    51.76   

            MntWines  MntFruits  MntMeatProducts  MntFishP

### Вывод: Лучшая кластеризация получилась без автоэнкодера и без нормализации: при k=2k=2 она дала silhouette ≈ 0.565 против ≈ 0.477 у варианта с автоэнкодером. В лучшем разбиении выделяются два сегмента: (1) клиенты с более низким доходом и низкими тратами по всем категориям, чаще с детьми; (2) клиенты с более высоким доходом, существенно более высокими тратами (особенно на вино/мясо) и более высоким откликом на кампании.