In [1]:
# 0) Импорты
import re, numpy as np, pandas as pd, warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, jaccard_score, classification_report)
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers, initializers

# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical warnings
# raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Fix the random seeds (42) so that weight initialisation and shuffling are repeatable.
tf.keras.utils.set_random_seed(42)
np.random.seed(42)  # presumably redundant: set_random_seed is documented to seed NumPy too — confirm


2025-11-18 13:09:58.529467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763460598.539448   95233 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763460598.542483   95233 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763460598.551048   95233 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763460598.551057   95233 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763460598.551059   95233 computation_placer.cc:177] computation placer alr

In [2]:
# 1) Load the dataset
# NOTE(review): absolute, machine-specific path — the notebook only runs where the
# CSV sits at exactly this location; consider a configurable data directory.
df = pd.read_csv("/home/konnilol/Downloads/drug_consumption_transformed.csv")


In [3]:
# 2) Парсинг мультиметки
def _norm_token(t: str) -> str:
    t = str(t).strip()
    t = re.sub(r"^[\[\]\(\)\{\}\"'‘’“”\s]+|[\[\]\(\)\{\}\"'‘’“”\s]+$", "", t)
    return t


In [4]:
def parse_labels(cell):
    """Turn one raw label cell into a list of cleaned, non-empty label strings.

    Parameters
    ----------
    cell : scalar, str, list, tuple, set or np.ndarray
        A missing value (``None``/NaN), an already-split collection of tokens,
        or a string such as ``"[A, B; C]"`` whose tokens are separated by
        commas or semicolons.

    Returns
    -------
    list of str
        Tokens normalised with ``_norm_token``; tokens that end up empty are
        dropped. A missing scalar yields ``[]``.
    """
    # Collections are checked first: for a list or ndarray, pd.isna() returns an
    # element-wise boolean array, and using that array in an `if` raises
    # ValueError for multi-element input instead of reaching this branch.
    if isinstance(cell, (list, tuple, set, np.ndarray)):
        toks = [_norm_token(x) for x in cell]
    elif pd.isna(cell):
        return []
    else:
        # Drop the outer square brackets of a stringified list, then split.
        raw = str(cell).strip().strip("[]")
        toks = [_norm_token(x) for x in re.split(r"[;,]", raw)]
    return [t for t in toks if t]


In [5]:
# Fail fast if the target column is missing from the loaded frame.
assert "Used in Last Month" in df.columns
# Parse every raw cell of the target column into a list of label strings.
y_labels = df["Used in Last Month"].apply(parse_labels)

# Binarise the label lists into an indicator matrix and cast it to float32.
mlb = MultiLabelBinarizer()
Y_full = mlb.fit_transform(y_labels).astype("float32")
classes_all = mlb.classes_  # label names as learned by the binarizer; indexed with column masks later


In [6]:
# 3) Features
# The twelve input columns. All of them are cast to float32, so they must already
# hold numeric values in the CSV (astype would raise otherwise).
feat = ["Age","Gender","Education","Country","Ethnicity",
        "Nscore","Escore","Oscore","Ascore","Cscore","Impulsive","SS"]
X_full = df[feat].astype("float32").copy()


In [7]:
# 4) "Stratified" split by multi-label cardinality
card = Y_full.sum(axis=1)   # number of active labels per row
bins = np.clip(card, 0, 6)  # bin by label count; clip the tail so rows with 6+ labels share one bin
# 80/20 train/test split, stratified on the label-count bin.
X_trn, X_tst, Y_trn_all, Y_tst_all = train_test_split(
    X_full, Y_full, test_size=0.2, random_state=42, stratify=bins
)


In [8]:
# 5) Drop the rarest labels, judged on TRAIN only (for stability)
pos = Y_trn_all.sum(axis=0)            # positives per label in the training split
neg = Y_trn_all.shape[0] - pos         # negatives per label in the training split
# Keep a label only if it has at least 5 positive and at least 5 negative training rows.
keep = (pos >= 5) & (neg >= 5)
labels_kept = classes_all[keep]
labels_drop = classes_all[~keep]
if labels_drop.size:
    print(f"[info] dropped labels: {list(labels_drop)}")

# Apply the same column mask to both splits so their label columns stay aligned.
Y_trn = Y_trn_all[:, keep]
Y_tst = Y_tst_all[:, keep]


[info] dropped labels: ['Semer']


In [9]:
# 6) Feature preprocessing
# Median imputation followed by standardisation. Both transformers are fitted on
# the training split and only applied (transform) to the test split.
imp = SimpleImputer(strategy="median")
sc  = StandardScaler()
X_trn_imp = imp.fit_transform(X_trn)
X_tst_imp = imp.transform(X_tst)
X_trn_sc  = sc.fit_transform(X_trn_imp)
X_tst_sc  = sc.transform(X_tst_imp)


In [10]:
# Inner validation split (used for threshold selection)
# NOTE(review): X_trn_sc was imputed and scaled with statistics fitted on the whole
# training split, so the validation rows carved out here contributed to those statistics.
card_tr = Y_trn.sum(axis=1)          # label count per training row (kept labels only)
bins_tr = np.clip(card_tr, 0, 6)     # same cardinality binning as the outer split
# 90/10 split of the training data, stratified on the label-count bin.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_trn_sc, Y_trn, test_size=0.1, random_state=42, stratify=bins_tr
)


In [11]:
# 7) Compute pos_weight and the output-layer bias
eps = 1e-3  # clip bound keeping pos_frac away from 0 and 1 so the logit below stays finite
pos_lab = Y_tr.sum(axis=0)            # positives per label in the inner training set
neg_lab = Y_tr.shape[0] - pos_lab     # negatives per label in the inner training set
# Per-label positive rate; np.maximum(..., 1.0) guards the division against a zero denominator.
pos_frac = np.clip(pos_lab / np.maximum(pos_lab + neg_lab, 1.0), eps, 1 - eps)
out_bias = np.log(pos_frac / (1.0 - pos_frac)).astype("float32")    # logit(p)
# Add-one smoothed ratio of negatives to positives, used to up-weight the positive loss term.
pos_weight = (neg_lab + 1.0) / (pos_lab + 1.0)                      # ~neg/pos

# Constant tensor handed to the loss factory when the model is compiled.
pos_weight_tf = tf.constant(pos_weight.astype("float32"))

def weighted_bce(pos_w):
    """Build a binary cross-entropy loss whose positive term is scaled by ``pos_w``.

    Parameters
    ----------
    pos_w
        Multiplier applied to the positive (``y_true``) part of the loss; it is
        combined with ``y_true`` by ordinary element-wise multiplication.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` returning the mean over all elements of
        ``-(pos_w * y * log(p) + (1 - y) * log(1 - p))``.
    """
    def loss(y_true, y_pred):
        tiny = tf.keras.backend.epsilon()
        # Keep probabilities strictly inside (0, 1) so both logarithms are finite.
        prob = tf.clip_by_value(y_pred, tiny, 1.0 - tiny)
        log_prob = tf.math.log(prob)
        log_not_prob = tf.math.log(1.0 - prob)
        positive_part = - pos_w * y_true * log_prob
        negative_part = - (1.0 - y_true) * log_not_prob
        return tf.reduce_mean(positive_part + negative_part)
    return loss


I0000 00:00:1763460686.271658   95233 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 998 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:06:00.0, compute capability: 8.6


In [12]:
# 8) Model
# Fully connected network: 256 -> 128 ReLU units with 25% dropout after each hidden
# layer, and one sigmoid output per kept label.
n_in, n_out = X_tr.shape[1], Y_tr.shape[1]
inp = layers.Input(shape=(n_in,))
x = layers.Dense(256, activation="relu", kernel_initializer="he_normal")(inp)
x = layers.Dropout(0.25)(x)
x = layers.Dense(128, activation="relu", kernel_initializer="he_normal")(x)
x = layers.Dropout(0.25)(x)
# The output bias starts at logit(positive rate) of each label (out_bias).
# NOTE(review): the loss additionally up-weights positives by ~neg/pos — confirm that
# combining a prior-based bias with that weighting is intended.
out = layers.Dense(
    n_out, activation="sigmoid",
    bias_initializer=initializers.Constant(out_bias)
)(x)


In [13]:
# Wrap the layer graph in a Model and compile it with Adam (learning rate 1e-3)
# and the positive-weighted binary cross-entropy built by weighted_bce.
model = models.Model(inp, out, name="MLP_BCE_Bias")
model.compile(
    optimizer=optimizers.Adam(1e-3),
    loss=weighted_bce(pos_weight_tf)
)


In [14]:
# Both callbacks watch val_loss: stop after 8 epochs without improvement (restoring
# the best weights), and halve the learning rate after 4 such epochs, down to 1e-5.
early  = callbacks.EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True)
reduce = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=4, min_lr=1e-5)


In [15]:
# Train silently for up to 60 epochs, validating on the inner validation split.
# The call is the cell's last expression, so its History object is displayed as output.
model.fit(
    X_tr, Y_tr, validation_data=(X_val, Y_val),
    epochs=60, batch_size=128, callbacks=[early, reduce], verbose=0
)


I0000 00:00:1763460713.618999   96557 service.cc:152] XLA service 0x7f09e000a9a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763460713.619011   96557 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2025-11-18 13:11:53.633540: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1763460713.714001   96557 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1763460714.255053   96557 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.






<keras.src.callbacks.history.History at 0x7f0af886fd40>

In [16]:
# 9) Predicted probabilities
# Sigmoid outputs for the inner training, inner validation and held-out test sets.
P_tr  = model.predict(X_tr,  batch_size=256, verbose=0)
P_val = model.predict(X_val, batch_size=256, verbose=0)
P_te  = model.predict(X_tst_sc, batch_size=256, verbose=0)























In [17]:
# 10) Пороговая логика: per-label F0.5 + safety fallback
from sklearn.metrics import fbeta_score

def best_label_thresholds(Y_true, P, beta=0.5):
    """Pick, for every label column, the threshold that maximises its F-beta score.

    Thresholds are searched over 37 evenly spaced values in [0.05, 0.95]; when
    several thresholds tie, the smallest one wins. A column without any
    positive in ``Y_true`` keeps the default 0.5.

    NOTE(review): a function with the same name is defined again in a later
    cell with a different default ``beta``; the later definition shadows this one.

    Parameters
    ----------
    Y_true : 2-D array of 0/1 targets (rows = samples, columns = labels).
    P : 2-D array of predicted scores, same layout as ``Y_true``.
    beta : float, weight of recall relative to precision in the F-beta score.

    Returns
    -------
    np.ndarray of float32, one threshold per column of ``P``.
    """
    candidates = np.linspace(0.05, 0.95, 37)
    thresholds = []
    for col in range(P.shape[1]):
        truth, proba = Y_true[:, col], P[:, col]
        if truth.sum() == 0:
            # No positives to score against: fall back to the neutral threshold.
            thresholds.append(0.5)
            continue
        scores = [
            fbeta_score(truth, (proba >= t).astype(int), beta=beta, average="binary", zero_division=0)
            for t in candidates
        ]
        # max() returns the first index that reaches the top score.
        winner = max(range(len(scores)), key=scores.__getitem__)
        thresholds.append(float(candidates[winner]))
    return np.array(thresholds, dtype="float32")

def best_global_threshold_fbeta(Y_true, P, beta=0.5):
    """Find the single threshold that maximises the micro-averaged F-beta score.

    The same 37-point grid over [0.05, 0.95] is applied to every label at once;
    on ties the smallest threshold wins.

    NOTE(review): a function with the same name is defined again in a later
    cell with a different default ``beta``; the later definition shadows this one.

    Returns
    -------
    tuple
        ``(threshold, score)`` for the best grid point.
    """
    best = None
    for t in np.linspace(0.05, 0.95, 37):
        score = fbeta_score(Y_true, (P >= t).astype(int), beta=beta, average="micro", zero_division=0)
        # Strict comparison keeps the earliest threshold when scores are equal.
        if best is None or score > best[1]:
            best = (t, score)
    return best

# Tune thresholds on the validation split with F0.5 (precision weighted above recall).
thr_vec = best_label_thresholds(Y_val, P_val, beta=0.5)
t_glob, s_glob = best_global_threshold_fbeta(Y_val, P_val, beta=0.5)
thr_vec = np.maximum(thr_vec, t_glob)   # never below the global threshold

def decode_safe(P, thr_vec, t_glob):
    """Threshold scores into 0/1 labels, with a top-1 rescue for empty rows.

    Each score is compared with its label's threshold. A row that ends up with
    no label at all receives its highest-scoring label, but only when that
    score is at least ``t_glob``.

    Parameters
    ----------
    P : 2-D array of scores (rows = samples, columns = labels).
    thr_vec : per-label thresholds, compared against ``P`` by broadcasting.
    t_glob : minimum top score required for the rescue.

    Returns
    -------
    Integer array shaped like ``P`` holding 0/1 decisions.
    """
    decoded = (P >= thr_vec).astype(int)
    no_label = decoded.sum(axis=1) == 0
    confident = P.max(axis=1) >= t_glob
    rescue_rows = np.flatnonzero(no_label & confident)
    # With no rows to rescue this assignment is a no-op.
    decoded[rescue_rows, P.argmax(axis=1)[rescue_rows]] = 1
    return decoded

# Decode the training and test probabilities with the F0.5 thresholds.
# NOTE(review): Yp_tr / Yp_te are not referenced again in the visible cells —
# the reported metrics use the later precision-tilted decoder instead.
Yp_tr = decode_safe(P_tr, thr_vec, t_glob)
Yp_te = decode_safe(P_te, thr_vec, t_glob)



In [18]:
# 11) Metrics
def eval_multilabel(name, Y_true, Y_pred, labels):
    """Print overall and per-label multi-label metrics for one set of predictions.

    Parameters
    ----------
    name : str, title inserted into the two section headers.
    Y_true, Y_pred : 0/1 indicator matrices of true and predicted labels.
    labels : sequence of label names passed to ``classification_report`` as
        ``target_names``.

    Prints subset accuracy, micro/macro precision-recall-F1, the sample-averaged
    Jaccard score and the per-label classification report. Returns ``None``.
    """
    def _prf(avg):
        # Precision, recall and F1 under a single averaging mode.
        return tuple(
            metric(Y_true, Y_pred, average=avg, zero_division=0)
            for metric in (precision_score, recall_score, f1_score)
        )

    subset_acc = accuracy_score(Y_true, Y_pred)
    micro_p, micro_r, micro_f1 = _prf("micro")
    macro_p, macro_r, macro_f1 = _prf("macro")
    jacc = jaccard_score(Y_true, Y_pred, average="samples", zero_division=0)

    overview = (
        f"\n=== {name} — Общие метрики ===",
        f"Subset Accuracy: {subset_acc:.4f}",
        f"Micro  P/R/F1 : {micro_p:.4f} / {micro_r:.4f} / {micro_f1:.4f}",
        f"Macro  P/R/F1 : {macro_p:.4f} / {macro_r:.4f} / {macro_f1:.4f}",
        f"Jaccard (samples): {jacc:.4f}",
    )
    for line in overview:
        print(line)

    print(f"\n=== {name} — Отчёт по меткам ===")
    print(classification_report(Y_true, Y_pred, target_names=labels, digits=4, zero_division=0))


In [19]:
# === Precision-tilted decoding (no retrain) ===
from sklearn.metrics import fbeta_score
import numpy as np

def best_label_thresholds(Y_true, P, beta=0.3):
    """Choose one F-beta-optimal threshold per label column (default beta 0.3).

    NOTE(review): this redefines the function of the same name from an earlier
    cell; only the default ``beta`` differs, and this later definition is the
    one in effect from here on.

    The search grid is 37 evenly spaced thresholds in [0.05, 0.95]; ties go to
    the smallest threshold. Columns of ``Y_true`` without a positive get 0.5.

    Returns
    -------
    np.ndarray of float32 with one entry per column of ``P``.
    """
    candidates = np.linspace(0.05, 0.95, 37)
    chosen = []
    for col in range(P.shape[1]):
        truth = Y_true[:, col]
        proba = P[:, col]
        if truth.sum() == 0:
            # Nothing to optimise against: use the neutral default.
            chosen.append(0.5)
            continue
        top_t, top_score = None, None
        for t in candidates:
            score = fbeta_score(truth, (proba >= t).astype(int), beta=beta, average="binary", zero_division=0)
            # Strict comparison keeps the first threshold that reaches the maximum.
            if top_score is None or score > top_score:
                top_t, top_score = t, score
        chosen.append(top_t)
    return np.array(chosen, dtype="float32")

def best_global_threshold_fbeta(Y_true, P, beta=0.3):
    """Return the ``(threshold, score)`` pair with the best micro F-beta (default beta 0.3).

    NOTE(review): this redefines the function of the same name from an earlier
    cell; only the default ``beta`` differs.

    One threshold from a 37-point grid over [0.05, 0.95] is applied to all
    labels at once; when scores tie, the smallest threshold is returned.
    """
    candidates = np.linspace(0.05, 0.95, 37)
    scores = [
        fbeta_score(Y_true, (P >= t).astype(int), beta=beta, average="micro", zero_division=0)
        for t in candidates
    ]
    # max() yields the first index holding the highest score.
    winner = max(range(len(scores)), key=scores.__getitem__)
    return candidates[winner], scores[winner]


In [20]:
# 1) Per-label thresholds tuned for F0.3, plus a global minimum that clamps them from below
# (overwrites the t_glob / thr_vec computed earlier with beta=0.5)
t_glob, _ = best_global_threshold_fbeta(Y_val, P_val, beta=0.3)
thr_vec = best_label_thresholds(Y_val, P_val, beta=0.3)
thr_vec = np.maximum(thr_vec, max(0.60, t_glob))  # never below 0.60 and never below the global threshold


In [21]:
# 2) Decoder: top-1 fallback only at high confidence, cap = 1 label per object
def decode_strict(P, thr_vec, t_glob, k_cap=1):
    """Threshold scores into 0/1 labels with a guarded fallback and a per-row cap.

    Steps:
      1. A label is switched on when its score reaches its own threshold.
      2. A row left without any label gets its highest-scoring label, but only
         if that score is at least ``max(t_glob, 0.65)``.
      3. If ``k_cap`` is not ``None``, rows with more than ``k_cap`` labels keep
         only their ``k_cap`` highest-scoring labels *among those already
         switched on*.

    Parameters
    ----------
    P : 2-D array of scores (rows = samples, columns = labels).
    thr_vec : per-label thresholds, compared against ``P`` by broadcasting.
    t_glob : global threshold used as the lower bound for the fallback.
    k_cap : int or None, maximum number of labels per row; ``None`` disables the cap.

    Returns
    -------
    Integer array shaped like ``P`` holding 0/1 decisions.
    """
    pred = (P >= thr_vec).astype(int)
    row_max = P.max(axis=1)
    row_arg = P.argmax(axis=1)
    empties = np.where(pred.sum(axis=1) == 0)[0]
    use = empties[row_max[empties] >= max(t_glob, 0.65)]
    if use.size:
        pred[use, row_arg[use]] = 1
    if k_cap is not None:
        too_many = np.where(pred.sum(axis=1) > k_cap)[0]
        for i in too_many:
            # Rank only the labels that are currently on. Ranking every label by
            # raw score could switch on a label that failed its own (higher)
            # threshold while discarding the ones that actually passed.
            active = np.where(pred[i] == 1)[0]
            order = np.argsort(-P[i, active], kind="stable")[:k_cap]
            pred[i] = 0
            pred[i, active[order]] = 1
    return pred

# Decode training and test probabilities with the precision-tilted thresholds,
# allowing at most one label per row (k_cap=1).
Y_pred_tr = decode_strict(P_tr, thr_vec, t_glob, k_cap=1)
Y_pred_te = decode_strict(P_te, thr_vec, t_glob, k_cap=1)


In [24]:
# 3) Reports
# NOTE(review): "0.60" in the message is a hard-coded literal that mirrors the clamp
# used when thr_vec was built; it will go stale if that constant is changed.
print(f"[decode] precision-tilted; global t={t_glob:.3f}, min per-label=0.60")


[decode] precision-tilted; global t=0.850, min per-label=0.60


In [25]:
# Evaluate the precision-tilted predictions on the inner training set and on the
# held-out test set, then show the threshold finally used for each kept label.
eval_multilabel("TRAIN — MLP_BCE_Bias (prec-tilted)", Y_tr,  Y_pred_tr, labels_kept)
eval_multilabel("TEST  — MLP_BCE_Bias (prec-tilted)", Y_tst, Y_pred_te, labels_kept)
print("per-label thresholds:", dict(zip(labels_kept, np.round(thr_vec, 3))))



=== TRAIN — MLP_BCE_Bias (prec-tilted) — Общие метрики ===
Subset Accuracy: 0.4392
Micro  P/R/F1 : 0.1014 / 0.0163 / 0.0281
Macro  P/R/F1 : 0.1246 / 0.0433 / 0.0370
Jaccard (samples): 0.0074

=== TRAIN — MLP_BCE_Bias (prec-tilted) — Отчёт по меткам ===
              precision    recall  f1-score   support

     Alcohol     0.4000    0.0101    0.0196       199
      Amphet     0.1250    0.0185    0.0323        54
        Amyl     0.0750    0.1765    0.1053        17
      Benzos     0.0000    0.0000    0.0000        91
        Caff     0.0000    0.0000    0.0000        86
    Cannabis     0.0952    0.0208    0.0342        96
        Choc     0.0000    0.0000    0.0000       216
        Coke     0.3333    0.0139    0.0267        72
       Crack     0.0645    0.2857    0.1053         7
     Ecstasy     0.0000    0.0000    0.0000       112
      Heroin     0.0312    0.0769    0.0444        13
    Ketamine     0.2222    0.0588    0.0930        34
         LSD     0.2500    0.0169    0.0317

### Самый лучший результат из большого множества моделей.

#### Всё равно результат никакой. Модель вообще не смогла уловить зависимости: даже на тренировочной выборке результат плохой, то есть нельзя винить даже переобучение :(